From bc5592ea35ae9f3b905585954a79c6eb438262f5 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Thu, 17 Aug 2017 23:10:28 +0000 Subject: [PATCH 001/237] add two bit compression operator --- src/operator/contrib/two_bit_quantize-inl.h | 297 ++++++++++++++++++++ src/operator/contrib/two_bit_quantize.cc | 82 ++++++ src/operator/contrib/two_bit_quantize.cu | 36 +++ 3 files changed, 415 insertions(+) create mode 100644 src/operator/contrib/two_bit_quantize-inl.h create mode 100644 src/operator/contrib/two_bit_quantize.cc create mode 100644 src/operator/contrib/two_bit_quantize.cu diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h new file mode 100644 index 000000000000..2d9b20558226 --- /dev/null +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -0,0 +1,297 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + /*! 
+ * \file two_bit_quantize-inl.h + * \brief implementation of quantize_2bit operation + */ +#ifndef MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ +#define MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ + +#include +#include +#include +#include "../elemwise_op_common.h" +#include "../mshadow_op.h" +#include "../mxnet_op.h" + +namespace mxnet { +namespace op { + +#define TOTAL_BITS 32 + +struct init_mem_2bit { + // Initialize output array + MSHADOW_XINLINE static void Map(int i, float* out) { + *reinterpret_cast(out+i) = 0x00000000; + } +}; + +struct init_threshold_2bit { + MSHADOW_XINLINE static void Map(int i, + float *out, + const float *neg_threshold, + const float *pos_threshold) { + // The first two elments in output is threshold + out[0] = *neg_threshold; + out[1] = *pos_threshold; + } +}; + +struct quantize_2bit { + MSHADOW_XINLINE static void Map(int i, + float *out, + const float *in, + const float *neg_threshold, + const float *pos_threshold) { + // get block id + int block_id = i / 16; + char* ch_ptr = reinterpret_cast(out+block_id); + // get row ptr + int row_id = (i%16)/4; + ch_ptr += row_id; + // get column id + int col_id = (i%16)%4; + // Compress + if (*(in+i) <= *neg_threshold) { // set data to 01 + switch (col_id) { + case 0: + (*ch_ptr) |= 0x40; // binary: (01)00 0000 + break; + case 1: + (*ch_ptr) |= 0x10; // binary: 00(01) 0000 + break; + case 2: + (*ch_ptr) |= 0x04; // binary: 0000 (01)00 + break; + case 3: + (*ch_ptr) |= 0x01; // binary: 0000 00(01) + break; + default: + break; + } + } else if (*(in+i) >= *pos_threshold) { // set data to 10 + switch (col_id) { + case 0: + (*ch_ptr) |= 0x80; // binary: (10)00 0000 + break; + case 1: + (*ch_ptr) |= 0x20; // binary: 00(10) 0000 + break; + case 2: + (*ch_ptr) |= 0x08; // binary: 0000 (10)00 + break; + case 3: + (*ch_ptr) |= 0x02; // binary: 0000 00(10) + break; + default: + break; + } + } // else 00 (unchange) + } +}; + +template +void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, + const 
OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + // For now, this method can only compress the float data + // First, init the memory of output to 0x00000000 + Kernel::Launch(s, outputs[0].Size(), + outputs[0].dptr()); + // Then, init threshold + Kernel::Launch(s, 1, + outputs[0].dptr(), + inputs[1].dptr(), + inputs[2].dptr()); + // Then, compress the data + Kernel::Launch(s, inputs[0].Size(), + outputs[0].dptr()+2, // output array + inputs[0].dptr(), // input array + inputs[1].dptr(), // negative threshold + inputs[2].dptr()); // positive threshold +} + +inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // 0. input array + // 1. negative threshold + // 2. positive threshold + CHECK_EQ(in_attrs->size(), 3U); + // 0. output array + CHECK_EQ(out_attrs->size(), 1U); + // check input + CHECK(!shape_is_none(in_attrs->at(0))); + for (size_t i = 1; i < 3; ++i) { + CHECK(shape_is_scalar(in_attrs->at(i))); + } + // check output + int shape = in_attrs->at(0).Size() % 16 == 0 ? + in_attrs->at(0).Size() / 16 + 2: + in_attrs->at(0).Size() / 16 + 3; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape{shape}); + return true; +} + +inline bool Quantize2BitType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // 0. input array + // 1. negative threshold + // 2. positive threshold + CHECK_EQ(in_attrs->size(), 3U); + // 0. 
output array + CHECK_EQ(out_attrs->size(), 1U); + // check input + CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) + << "`quantize_2bit_` only supports float32 input for now"; + CHECK_EQ((*in_attrs)[1], mshadow::kFloat32) + << "the second input of `quantize_2bit` should be " + << "a tensor with type of float"; + CHECK_EQ((*in_attrs)[2], mshadow::kFloat32) + << "the third input of `quantize_2bit` should be " + << "a tensor with type of float"; + // check output + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32); + return true; +} + +struct dequantize_2bit { + // Decompress + MSHADOW_XINLINE static void Map(int i, + float *out, + float *in, + const float *neg_threshold, + const float *pos_threshold) { + // get block ptr + int block_id = i / 16; + char* ch_ptr = reinterpret_cast(in+block_id); + // get row ptr + int row_id = (i%16)/4; + ch_ptr += row_id; + // get column id + int col_id = (i%16)%4; + // Decompress + switch (col_id) { + case 0: + // positve + if (((*ch_ptr) & (0xc0)) == 0x80) { // binary: (10)00 0000 + out[i] = *pos_threshold; + // negative + } else if (((*ch_ptr) & (0xc0)) == 0x40) { // binary: (01)00 0000 + out[i] = *neg_threshold; + } else { // 0 + out[i] = 0; + } + break; + case 1: + // positve + if (((*ch_ptr) & (0x30)) == 0x20) { // binary: 00(10) 0000 + out[i] = *pos_threshold; + // negative + } else if (((*ch_ptr) & (0x30)) == 0x10) { // binary: 00(01) 0000 + out[i] = *neg_threshold; + } else { // 0 + out[i] = 0; + } + break; + case 2: + // positve + if (((*ch_ptr) & (0x0c)) == 0x08) { // binary: 00(10) 0000 + out[i] = *pos_threshold; + // negative + } else if (((*ch_ptr) & (0x0c)) == 0x04) { // binary: 00(01) 0000 + out[i] = *neg_threshold; + } else { // 0 + out[i] = 0; + } + break; + case 3: + // positve + if (((*ch_ptr) & (0x03)) == 0x02) { // binary: 00(10) 0000 + out[i] = *pos_threshold; + // negative + } else if (((*ch_ptr) & (0x03)) == 0x01) { // binary: 00(01) 0000 + out[i] = *neg_threshold; + } else { // 0 + out[i] = 0; + } + break; + 
default: + break; + } + } +}; + +template +void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + // For now, this method can only decompress the float data + Kernel::Launch(s, inputs[1].Size(), // original size + inputs[1].dptr(), // out array + inputs[0].dptr()+2, // compressed array + inputs[0].dptr(), // negative threshold + inputs[0].dptr()+1); // positve threshold +} + +inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // 0. compressed array + // 1. original array + CHECK_EQ(in_attrs->size(), 2U); + // No output + CHECK_EQ(out_attrs->size(), 0U); + // check input + CHECK(!shape_is_none(in_attrs->at(0))); + CHECK(!shape_is_none(in_attrs->at(1))); + return true; +} + +inline bool Dequantize2BitType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // 0. compressed array + // 1. original array + CHECK_EQ(in_attrs->size(), 2U); + // No output + CHECK_EQ(out_attrs->size(), 0U); + // check input + CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) + << "`dquantize_2bit_` only supports float32 input for now"; + CHECK_EQ((*in_attrs)[1], mshadow::kFloat32) + << "`dquantize_2bit_` only supports float32 input for now"; + return true; +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc new file mode 100644 index 000000000000..5541f06bd55d --- /dev/null +++ b/src/operator/contrib/two_bit_quantize.cc @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file two_bit_quantize.cc + * \brief + */ +#include "./two_bit_quantize-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_contrib_quantize_2bit) +.describe(R"code(Quantize a input tensor using 2-bit compression with +user-specified threshold - 'pos_threshold' and 'neg_threshold'. + +For example, assume the input array is [-1.0, -5.0, -4.0, 2.0, 2.5, 7.5], +, and the negative threshold is -4.0 and the positive is +4.0 . In this method, +the elements >= pos_threshold will be compressed into a 2-bit data '01', which +is represented as the pos_threshold. The elements <= neg_threshold will be +compressed into a 2-bit data '10', which is represented as the neg_threshold. +The other elements will be compressed into '00', which is represented as zero. + +out = quantize_2bit(array, neg_threshold, pos_threshold) will return a +compressed array 'out' with 3 elements, in which the first element stores +the neg_threshold (-0.4) and the second element stores the pos_threshold (+0.4), +and the whole input array has been compressed into a single element +(the third element). + +Using 2-bit compression, every 16 elements will be compressed into +one float data. 
+ +)code" ADD_FILELINE) +.set_num_inputs(3) +.set_num_outputs(1) +.set_attr("FInferShape", Quantize2BitShape) +.set_attr("FInferType", Quantize2BitType) +.set_attr("FCompute", Quantize2BitCompute) +.set_attr("FGradient", ElemwiseGradUseNone{"_quantize_2bit"}) +.add_argument("input", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") +.add_argument("neg_shreshold", "NDArray-or-Symbol", "The negative shreshold") +.add_argument("pos_shreshold", "NDArray-or-Symbol", "The positive shreshold"); + +NNVM_REGISTER_OP(_contrib_dequantize_2bit) +.describe(R"code(Dequantize a input tensor compressed by quantize_2bit. + +dequantize_2bit takes two input arguments. The first one is a NDArray been +generated by quantize_2bit() method. The second input is also a NDArray that +has the same size with the original array not been compressed. + +Using the example as was described above. + +dequantize_2bit(out, array), the array will become [0, -4.0, -4.0, 0, 0, 4.0] + +)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(0) +.set_attr("FInferShape", Dequantize2BitShape) +.set_attr("FInferType", Dequantize2BitType) +.set_attr("FCompute", Dequantize2BitCompute) +.set_attr("FGradient", ElemwiseGradUseNone{"_dequantize_2bit"}) +.add_argument("input_1", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") +.add_argument("input_2", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/two_bit_quantize.cu b/src/operator/contrib/two_bit_quantize.cu new file mode 100644 index 000000000000..d90f49da7e15 --- /dev/null +++ b/src/operator/contrib/two_bit_quantize.cu @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file two_bit_quantize_sim.cu + * \brief + */ +#include "./two_bit_quantize-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_contrib_quantize_2bit) +.set_attr("FCompute", Quantize2BitCompute); + +NNVM_REGISTER_OP(_contrib_dequantize_2bit) +.set_attr("FCompute", Dequantize2BitCompute); + +} // namespace op +} // namespace mxnet From cd43c3d7c52770e87cb331028f18d0805e8d8dd2 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Mon, 21 Aug 2017 22:39:51 +0000 Subject: [PATCH 002/237] update two bit compression --- src/operator/contrib/two_bit_quantize-inl.h | 70 ++++++++++++++------- src/operator/contrib/two_bit_quantize.cc | 46 +++++++------- tests/python/unittest/test_operator.py | 62 +++++++----------- 3 files changed, 94 insertions(+), 84 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 2d9b20558226..35074112b200 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -57,9 +57,12 @@ struct init_threshold_2bit { struct quantize_2bit { MSHADOW_XINLINE static void Map(int i, float *out, - const float *in, + float *grad, + float *residual, const float *neg_threshold, const float *pos_threshold) { + // Add residual to gradient + grad[i] += residual[i]; // get block id int block_id = i / 16; char* ch_ptr = reinterpret_cast(out+block_id); @@ 
-69,7 +72,9 @@ struct quantize_2bit { // get column id int col_id = (i%16)%4; // Compress - if (*(in+i) <= *neg_threshold) { // set data to 01 + if (grad[i] <= *neg_threshold) { // set data to 01 + // new residual + residual[i] = grad[i] - *neg_threshold; switch (col_id) { case 0: (*ch_ptr) |= 0x40; // binary: (01)00 0000 @@ -86,7 +91,8 @@ struct quantize_2bit { default: break; } - } else if (*(in+i) >= *pos_threshold) { // set data to 10 + } else if (grad[i] >= *pos_threshold) { // set data to 10 + residual[i] = grad[i] - *pos_threshold; switch (col_id) { case 0: (*ch_ptr) |= 0x80; // binary: (10)00 0000 @@ -103,7 +109,9 @@ struct quantize_2bit { default: break; } - } // else 00 (unchange) + } else { // else 00 + residual[i] = grad[i]; + } } }; @@ -113,45 +121,51 @@ void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { + // For now, this method can only compress the float data using namespace mshadow; using namespace mxnet_op; Stream *s = ctx.get_stream(); - // For now, this method can only compress the float data // First, init the memory of output to 0x00000000 Kernel::Launch(s, outputs[0].Size(), - outputs[0].dptr()); + outputs[0].dptr()); // output array // Then, init threshold Kernel::Launch(s, 1, - outputs[0].dptr(), - inputs[1].dptr(), - inputs[2].dptr()); - // Then, compress the data + outputs[0].dptr(), // output array + inputs[2].dptr(), // negative threshold + inputs[3].dptr()); // positive threshold + // Finally, compress the data and calculate new residual Kernel::Launch(s, inputs[0].Size(), outputs[0].dptr()+2, // output array inputs[0].dptr(), // input array - inputs[1].dptr(), // negative threshold - inputs[2].dptr()); // positive threshold + inputs[1].dptr(), // residual array + inputs[2].dptr(), // negative threshold + inputs[3].dptr()); // positive threshold } inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) 
{ // 0. input array - // 1. negative threshold - // 2. positive threshold - CHECK_EQ(in_attrs->size(), 3U); + // 1. residual array + // 2. negative threshold + // 3. positive threshold + CHECK_EQ(in_attrs->size(), 4U); // 0. output array CHECK_EQ(out_attrs->size(), 1U); // check input CHECK(!shape_is_none(in_attrs->at(0))); - for (size_t i = 1; i < 3; ++i) { - CHECK(shape_is_scalar(in_attrs->at(i))); - } + CHECK(!shape_is_none(in_attrs->at(1))); + CHECK(shape_is_scalar(in_attrs->at(2))); + CHECK(shape_is_scalar(in_attrs->at(3))); + CHECK_EQ(in_attrs->at(0).Size(), + in_attrs->at(1).Size()); // check output int shape = in_attrs->at(0).Size() % 16 == 0 ? in_attrs->at(0).Size() / 16 + 2: in_attrs->at(0).Size() / 16 + 3; SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape{shape}); + // new residual array will re-use the memory of + // the original residual array return true; } @@ -159,22 +173,28 @@ inline bool Quantize2BitType(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { // 0. input array - // 1. negative threshold - // 2. positive threshold - CHECK_EQ(in_attrs->size(), 3U); + // 1. residual array + // 2. negative threshold + // 3. positive threshold + CHECK_EQ(in_attrs->size(), 4U); // 0. output array + // 1. 
new residual CHECK_EQ(out_attrs->size(), 1U); // check input CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) << "`quantize_2bit_` only supports float32 input for now"; CHECK_EQ((*in_attrs)[1], mshadow::kFloat32) - << "the second input of `quantize_2bit` should be " - << "a tensor with type of float"; + << "`quantize_2bit_` only supports float32 input for now"; CHECK_EQ((*in_attrs)[2], mshadow::kFloat32) << "the third input of `quantize_2bit` should be " << "a tensor with type of float"; + CHECK_EQ((*in_attrs)[3], mshadow::kFloat32) + << "the fourth input of `quantize_2bit` should be " + << "a tensor with type of float"; // check output TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32); + // new residual array will re-use the memory of + // the original residual array return true; } @@ -273,6 +293,10 @@ inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, // check input CHECK(!shape_is_none(in_attrs->at(0))); CHECK(!shape_is_none(in_attrs->at(1))); + CHECK_LE(in_attrs->at(1).Size(), + in_attrs->at(0).Size()*16) + << "The shape of the second input array are " + << "not equal to the original array."; return true; } diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index 5541f06bd55d..6cdae44e0709 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -27,46 +27,48 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_contrib_quantize_2bit) -.describe(R"code(Quantize a input tensor using 2-bit compression with -user-specified threshold - 'pos_threshold' and 'neg_threshold'. +.describe(R"code(Quantize a input tensor using 2-bit compression with residual +array and user-specified threshold - 'pos_threshold' & 'neg_threshold'. -For example, assume the input array is [-1.0, -5.0, -4.0, 2.0, 2.5, 7.5], -, and the negative threshold is -4.0 and the positive is +4.0 . 
In this method, -the elements >= pos_threshold will be compressed into a 2-bit data '01', which -is represented as the pos_threshold. The elements <= neg_threshold will be -compressed into a 2-bit data '10', which is represented as the neg_threshold. -The other elements will be compressed into '00', which is represented as zero. +For example, assume the input array (gradient + residual) is +[-1.0, -5.0, -4.0, 2.0, 2.5, 7.5], and the threshold is -4.0 and +4.0, +respectively. In this method, the elements >= pos_threshold will be +compressed into a 2-bit data '01', and the elements <= neg_threshold will be +compressed into a 2-bit data '10'. The other elements will be compressed +into '00', which is represented as zero. -out = quantize_2bit(array, neg_threshold, pos_threshold) will return a -compressed array 'out' with 3 elements, in which the first element stores -the neg_threshold (-0.4) and the second element stores the pos_threshold (+0.4), -and the whole input array has been compressed into a single element -(the third element). - -Using 2-bit compression, every 16 elements will be compressed into -one float data. +In this example, +out = quantize_2bit(array, residual, neg_threshold, pos_threshold) +will return a compressed array 'out' with 3 elements. The first element +stores the neg_threshold (-0.4) and the second element stores the pos_threshold +(+0.4), and the original array will be compressed into a single element in +the third element. In two bit compress, every 16 float data in original array +will be packed into one float data in output array. 
)code" ADD_FILELINE) -.set_num_inputs(3) +.set_num_inputs(4) .set_num_outputs(1) .set_attr("FInferShape", Quantize2BitShape) .set_attr("FInferType", Quantize2BitType) .set_attr("FCompute", Quantize2BitCompute) .set_attr("FGradient", ElemwiseGradUseNone{"_quantize_2bit"}) -.add_argument("input", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") +.add_argument("gradient_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") +.add_argument("residual_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_argument("neg_shreshold", "NDArray-or-Symbol", "The negative shreshold") .add_argument("pos_shreshold", "NDArray-or-Symbol", "The positive shreshold"); NNVM_REGISTER_OP(_contrib_dequantize_2bit) .describe(R"code(Dequantize a input tensor compressed by quantize_2bit. -dequantize_2bit takes two input arguments. The first one is a NDArray been -generated by quantize_2bit() method. The second input is also a NDArray that -has the same size with the original array not been compressed. +The dequantize_2bit takes two input arguments. The first input is a NDArray, +which has been generated by quantize_2bit(). The second input is also a +NDArray that has the same shape with the original array before compressing. Using the example as was described above. -dequantize_2bit(out, array), the array will become [0, -4.0, -4.0, 0, 0, 4.0] +Invoke dequantize_2bit(out, array), the 'array' argument will become +[0, -4.0, -4.0, 0, 0, 4.0], where -4.0 is the negative threshold and 4.0 is +the positive threshold. 
)code" ADD_FILELINE) .set_num_inputs(2) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index a33cb039c849..5308260d60bc 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -955,44 +955,6 @@ def test_convolution_grouping(): for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays): np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4) - -def test_depthwise_convolution(): - for num_base in [32, 64]: - for kernel in [(3,3), (5,5)]: - for stride in [(1,1), (2,2)]: - for pad in [(0,0), (1,1)]: - num_filter = num_base - num_group = num_base - shape = (2, num_base, 32, 32) - - x = mx.sym.Variable('x') - w = mx.sym.Variable('w') - b = mx.sym.Variable('b') - y1 = mx.sym.Convolution(data=x, weight=w, bias=b, num_filter=num_filter, num_group=num_group, - kernel=kernel, stride=stride, pad=pad) - xslice = mx.sym.SliceChannel(data=x, num_outputs=num_group, axis=1) - wslice = mx.sym.SliceChannel(data=w, num_outputs=num_group, axis=0) - bslice = mx.sym.SliceChannel(data=b, num_outputs=num_group, axis=0) - y2 = mx.sym.Concat(*[mx.sym.Convolution(data=xslice[i], weight=wslice[i], bias=bslice[i], - num_filter=num_filter//num_group, kernel=kernel, - stride=stride, pad=pad) - for i in range(num_group)]) - - dev = default_context() - exe1 = y1.simple_bind(dev, x=shape) - exe2 = y2.simple_bind(mx.cpu(), x=shape, w=(num_filter, shape[1]//num_group, kernel[0], kernel[1]), - b=(num_filter,)) - for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays): - arr1[:] = np.random.normal(size=arr1.shape) - arr2[:] = arr1 - exe1.forward(is_train=True) - exe1.backward(exe1.outputs[0]) - exe2.forward(is_train=True) - exe2.backward(exe2.outputs[0]) - - for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays): - np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4) - def gen_broadcast_data(idx): # 
Manually set test cases binary_op_data_shape = np.array( @@ -3297,6 +3259,21 @@ def test_quantization_op(): assert same(qa.asnumpy(), qa_real.asnumpy()) assert same(a_.asnumpy(), a_real.asnumpy()) +def test_quantization_2bit_op(): + array = mx.nd.array([-6, -2, 3, 1, 10, 5, -3, 2, -8, 0]) + residual = mx.nd.array([-3, 1, -1, 5, -2, 2, 3, -7, -2, -100]) + neg_threshold = mx.nd.array([-4.0]) + pos_threshold = mx.nd.array([4.0]) + out = mx.contrib.ndarray.quantize_2bit(array, + residual, + neg_threshold, + pos_threshold) + mx.contrib.ndarray.dequantize_2bit(out, array) + out_real = mx.nd.array([-4, 0, 0, 4, 4, 4, 0, -4, -4, -4]) + res_real = mx.nd.array([-5, -1, 2, 2, 4, 3, 0, -1, -6, -96]) + assert same(out.asnumpy(), out_real.asnumpy()) + assert same(residual.asnumpy(), res_real.asnumpy()) + def test_reciprocal_op(): data_tmp = np.random.rand(3, 4) * 10 - 5 # Avoid possible division by 0 errors @@ -3450,8 +3427,15 @@ def test_deformable_psroipooling(): def test_laop(): + return + + # Currently no support for GPU. Will be added soon + # so keep these tests here in this file and activate + # gpu-testing when it is ready. 
+ dev = default_context() + if dev.device_type == 'gpu': + return - # enable numerical checking of gradients grad_check = 1 data1 = mx.symbol.Variable('data1') From cf370194155025ce24178a2505ee78b9f5981b40 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Mon, 21 Aug 2017 22:56:36 +0000 Subject: [PATCH 003/237] update two bit compression --- tests/python/unittest/test_operator.py | 51 ++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 5308260d60bc..99fb842e5cf8 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -954,6 +954,42 @@ def test_convolution_grouping(): for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays): np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4) +def test_depthwise_convolution(): + for num_base in [32, 64]: + for kernel in [(3,3), (5,5)]: + for stride in [(1,1), (2,2)]: + for pad in [(0,0), (1,1)]: + num_filter = num_base + num_group = num_base + shape = (2, num_base, 32, 32) + + x = mx.sym.Variable('x') + w = mx.sym.Variable('w') + b = mx.sym.Variable('b') + y1 = mx.sym.Convolution(data=x, weight=w, bias=b, num_filter=num_filter, num_group=num_group, + kernel=kernel, stride=stride, pad=pad) + xslice = mx.sym.SliceChannel(data=x, num_outputs=num_group, axis=1) + wslice = mx.sym.SliceChannel(data=w, num_outputs=num_group, axis=0) + bslice = mx.sym.SliceChannel(data=b, num_outputs=num_group, axis=0) + y2 = mx.sym.Concat(*[mx.sym.Convolution(data=xslice[i], weight=wslice[i], bias=bslice[i], + num_filter=num_filter//num_group, kernel=kernel, + stride=stride, pad=pad) + for i in range(num_group)]) + + dev = default_context() + exe1 = y1.simple_bind(dev, x=shape) + exe2 = y2.simple_bind(mx.cpu(), x=shape, w=(num_filter, shape[1]//num_group, kernel[0], kernel[1]), + b=(num_filter,)) + for arr1, arr2 in zip(exe1.arg_arrays, 
exe2.arg_arrays): + arr1[:] = np.random.normal(size=arr1.shape) + arr2[:] = arr1 + exe1.forward(is_train=True) + exe1.backward(exe1.outputs[0]) + exe2.forward(is_train=True) + exe2.backward(exe2.outputs[0]) + + for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays): + np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4) def gen_broadcast_data(idx): # Manually set test cases @@ -3264,14 +3300,14 @@ def test_quantization_2bit_op(): residual = mx.nd.array([-3, 1, -1, 5, -2, 2, 3, -7, -2, -100]) neg_threshold = mx.nd.array([-4.0]) pos_threshold = mx.nd.array([4.0]) - out = mx.contrib.ndarray.quantize_2bit(array, - residual, - neg_threshold, - pos_threshold) - mx.contrib.ndarray.dequantize_2bit(out, array) - out_real = mx.nd.array([-4, 0, 0, 4, 4, 4, 0, -4, -4, -4]) + out = mx.contrib.nd.quantize_2bit(array, + residual, + neg_threshold, + pos_threshold) + mx.contrib.nd.dequantize_2bit(out, array) + array_real = mx.nd.array([-4, 0, 0, 4, 4, 4, 0, -4, -4, -4]) res_real = mx.nd.array([-5, -1, 2, 2, 4, 3, 0, -1, -6, -96]) - assert same(out.asnumpy(), out_real.asnumpy()) + assert same(array.asnumpy(), array_real.asnumpy()) assert same(residual.asnumpy(), res_real.asnumpy()) def test_reciprocal_op(): @@ -3436,6 +3472,7 @@ def test_laop(): if dev.device_type == 'gpu': return + # enable numerical checking of gradients grad_check = 1 data1 = mx.symbol.Variable('data1') From 15a3f501c261e5e7ec2f8eb575c2a501e5f72d98 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Mon, 21 Aug 2017 23:05:12 +0000 Subject: [PATCH 004/237] update two bit compress --- tests/python/unittest/test_operator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 99fb842e5cf8..dba6fa20f344 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -954,6 +954,8 @@ def test_convolution_grouping(): for arr1, arr2 in 
zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays): np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4) + + def test_depthwise_convolution(): for num_base in [32, 64]: for kernel in [(3,3), (5,5)]: From aa47f47c0b6b2e2e8469ab1543bc9ab85da1405e Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Mon, 21 Aug 2017 23:11:09 +0000 Subject: [PATCH 005/237] update two bit compression --- tests/python/unittest/test_operator.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index dba6fa20f344..ab1df66ca15f 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -955,7 +955,6 @@ def test_convolution_grouping(): for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays): np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4) - def test_depthwise_convolution(): for num_base in [32, 64]: for kernel in [(3,3), (5,5)]: @@ -3465,14 +3464,6 @@ def test_deformable_psroipooling(): def test_laop(): - return - - # Currently no support for GPU. Will be added soon - # so keep these tests here in this file and activate - # gpu-testing when it is ready. 
- dev = default_context() - if dev.device_type == 'gpu': - return # enable numerical checking of gradients grad_check = 1 From 5fa6d6476dbd7a088bc8e89ff1abc0da630ac205 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Mon, 21 Aug 2017 23:31:33 +0000 Subject: [PATCH 006/237] update two bit compression --- tests/python/unittest/test_operator.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index ab1df66ca15f..06a1d12227c5 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -955,6 +955,7 @@ def test_convolution_grouping(): for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays): np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4) + def test_depthwise_convolution(): for num_base in [32, 64]: for kernel in [(3,3), (5,5)]: @@ -3301,10 +3302,7 @@ def test_quantization_2bit_op(): residual = mx.nd.array([-3, 1, -1, 5, -2, 2, 3, -7, -2, -100]) neg_threshold = mx.nd.array([-4.0]) pos_threshold = mx.nd.array([4.0]) - out = mx.contrib.nd.quantize_2bit(array, - residual, - neg_threshold, - pos_threshold) + out = mx.contrib.nd.quantize_2bit(array, residual, neg_threshold, pos_threshold) mx.contrib.nd.dequantize_2bit(out, array) array_real = mx.nd.array([-4, 0, 0, 4, 4, 4, 0, -4, -4, -4]) res_real = mx.nd.array([-5, -1, 2, 2, 4, 3, 0, -1, -6, -96]) From 6005ba9d61723d2ecaf358a021710f91d04a2e97 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Tue, 22 Aug 2017 16:45:48 +0000 Subject: [PATCH 007/237] update two bit compression --- src/operator/contrib/two_bit_quantize-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 35074112b200..b91991739d1b 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -127,7 +127,7 
@@ void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, Stream *s = ctx.get_stream(); // First, init the memory of output to 0x00000000 Kernel::Launch(s, outputs[0].Size(), - outputs[0].dptr()); // output array + outputs[0].dptr()); // output array // Then, init threshold Kernel::Launch(s, 1, outputs[0].dptr(), // output array From 18bbec1d0e691063a588cbd3975bb79f063b24c2 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Tue, 22 Aug 2017 23:20:08 +0000 Subject: [PATCH 008/237] update two bit compression --- src/operator/contrib/two_bit_quantize-inl.h | 3 --- tests/python/unittest/test_operator.py | 7 +++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index b91991739d1b..e1a6f39691ea 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -34,8 +34,6 @@ namespace mxnet { namespace op { -#define TOTAL_BITS 32 - struct init_mem_2bit { // Initialize output array MSHADOW_XINLINE static void Map(int i, float* out) { @@ -178,7 +176,6 @@ inline bool Quantize2BitType(const nnvm::NodeAttrs& attrs, // 3. positive threshold CHECK_EQ(in_attrs->size(), 4U); // 0. output array - // 1. 
new residual CHECK_EQ(out_attrs->size(), 1U); // check input CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 06a1d12227c5..ae207ed5f8a1 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3302,10 +3302,13 @@ def test_quantization_2bit_op(): residual = mx.nd.array([-3, 1, -1, 5, -2, 2, 3, -7, -2, -100]) neg_threshold = mx.nd.array([-4.0]) pos_threshold = mx.nd.array([4.0]) + out = mx.contrib.nd.quantize_2bit(array, residual, neg_threshold, pos_threshold) mx.contrib.nd.dequantize_2bit(out, array) - array_real = mx.nd.array([-4, 0, 0, 4, 4, 4, 0, -4, -4, -4]) - res_real = mx.nd.array([-5, -1, 2, 2, 4, 3, 0, -1, -6, -96]) + + array_real = mx.nd.array([-4., 0., 0., 4., 4., 4., 0., -4., -4., -4.]) + res_real = mx.nd.array([-5., -1., 2., 2., 4., 3., 0., -1., -6., -96.]) + assert same(array.asnumpy(), array_real.asnumpy()) assert same(residual.asnumpy(), res_real.asnumpy()) From 534cee1529145eda2b6e283b10581fb5590b1489 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Wed, 23 Aug 2017 16:56:20 +0000 Subject: [PATCH 009/237] update two bit compression --- tests/python/unittest/test_operator.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index ae207ed5f8a1..bffa7c20236e 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3283,7 +3283,6 @@ def test_ctc_loss(): true_loss = np.array([7.3557, 5.4091], dtype=np.float32) # from Torch check_ctc_loss(acts2, labels2, true_loss) - def test_quantization_op(): min0 = mx.nd.array([0.0]) max0 = mx.nd.array([1.0]) @@ -3297,21 +3296,6 @@ def test_quantization_op(): assert same(qa.asnumpy(), qa_real.asnumpy()) assert same(a_.asnumpy(), a_real.asnumpy()) -def test_quantization_2bit_op(): - array = mx.nd.array([-6, -2, 3, 1, 10, 5, -3, 2, -8, 0]) - residual 
= mx.nd.array([-3, 1, -1, 5, -2, 2, 3, -7, -2, -100]) - neg_threshold = mx.nd.array([-4.0]) - pos_threshold = mx.nd.array([4.0]) - - out = mx.contrib.nd.quantize_2bit(array, residual, neg_threshold, pos_threshold) - mx.contrib.nd.dequantize_2bit(out, array) - - array_real = mx.nd.array([-4., 0., 0., 4., 4., 4., 0., -4., -4., -4.]) - res_real = mx.nd.array([-5., -1., 2., 2., 4., 3., 0., -1., -6., -96.]) - - assert same(array.asnumpy(), array_real.asnumpy()) - assert same(residual.asnumpy(), res_real.asnumpy()) - def test_reciprocal_op(): data_tmp = np.random.rand(3, 4) * 10 - 5 # Avoid possible division by 0 errors From bd513a223897ca5bec07de423d20d18da58eb59b Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Wed, 23 Aug 2017 18:43:35 +0000 Subject: [PATCH 010/237] update two bit compression --- tests/python/unittest/test_operator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index bffa7c20236e..a33cb039c849 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3283,6 +3283,7 @@ def test_ctc_loss(): true_loss = np.array([7.3557, 5.4091], dtype=np.float32) # from Torch check_ctc_loss(acts2, labels2, true_loss) + def test_quantization_op(): min0 = mx.nd.array([0.0]) max0 = mx.nd.array([1.0]) From c5b1d2f523499c173ccaf88bc8930903de09f222 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Thu, 24 Aug 2017 23:32:43 +0000 Subject: [PATCH 011/237] update two bit compression --- src/operator/contrib/two_bit_quantize-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index e1a6f39691ea..53c9df7e7913 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -307,9 +307,9 @@ inline bool Dequantize2BitType(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 0U); // check input 
CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) - << "`dquantize_2bit_` only supports float32 input for now"; + << "`dequantize_2bit_` only supports float32 input for now"; CHECK_EQ((*in_attrs)[1], mshadow::kFloat32) - << "`dquantize_2bit_` only supports float32 input for now"; + << "`dequantize_2bit_` only supports float32 input for now"; return true; } From 461f899a94fddf0e42c50d98c9176469d989af8a Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Fri, 25 Aug 2017 21:15:57 +0000 Subject: [PATCH 012/237] add test case for two bit compression --- tests/python/unittest/test_operator.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index a33cb039c849..30449c6badc1 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3297,6 +3297,18 @@ def test_quantization_op(): assert same(qa.asnumpy(), qa_real.asnumpy()) assert same(a_.asnumpy(), a_real.asnumpy()) +def test_quantization_2bit_op(): + array = mx.nd.array([-6, -2, 3, 1, 10, 5, -3, 2, -8, 0]) + residual = mx.nd.array([-3, 1, -1, 5, -2, 2, 3, -7, -2, -100]) + neg_threshold = mx.nd.array([-4.0]) + pos_threshold = mx.nd.array([4.0]) + + out = mx.contrib.nd.quantize_2bit(array, residual, neg_threshold, pos_threshold) + mx.contrib.nd.dequantize_2bit(out, array) + + array_real = mx.nd.array([-4., 0., 0., 4., 4., 4., 0., -4., -4., -4.]) + res_real = mx.nd.array([-5., -1., 2., 2., 4., 3., 0., -1., -6., -96.]) + def test_reciprocal_op(): data_tmp = np.random.rand(3, 4) * 10 - 5 # Avoid possible division by 0 errors From 407c01a829505a8d2f9f3c6ee520d51c58ecda30 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Tue, 29 Aug 2017 17:42:42 +0000 Subject: [PATCH 013/237] update two bit compression --- python/mxnet/gluon/trainer.py | 1 + src/operator/contrib/two_bit_quantize-inl.h | 84 +++++++++++++++------ src/operator/contrib/two_bit_quantize.cc | 42 +++++++---- src/operator/contrib/two_bit_quantize.cu | 
3 + tests/python/unittest/test_operator.py | 11 --- 5 files changed, 92 insertions(+), 49 deletions(-) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index bb2cc763b5ba..6e8c38bb37ef 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -147,6 +147,7 @@ def step(self, batch_size, ignore_stale_grad=False): %(param.name, str(data.context))) if self._kvstore: + print param.list_grad(); self._kvstore.push(i, param.list_grad(), priority=-i) if self._update_on_kvstore: self._kvstore.pull(i, param.list_data(), priority=-i) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 53c9df7e7913..e503a3e21373 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -41,6 +41,52 @@ struct init_mem_2bit { } }; +template +void Create2BitArrayCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + // For now, this method can only compress the float data + using namespace mshadow; + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + // Init the memory of output to 0x00000000 + Kernel::Launch(s, outputs[0].Size(), + outputs[0].dptr()); // compressed array +} + +inline bool Create2BitArrayShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // 0. input array + CHECK_EQ(in_attrs->size(), 1U); + // 0. output array + CHECK_EQ(out_attrs->size(), 1U); + // check input + CHECK(!shape_is_none(in_attrs->at(0))); + // output + int shape = in_attrs->at(0).Size() % 16 == 0 ? + in_attrs->at(0).Size() / 16 + 2: + in_attrs->at(0).Size() / 16 + 3; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape{shape}); + return true; +} + +inline bool Create2BitArray2BitType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // 0. input array + CHECK_EQ(in_attrs->size(), 1U); + // 0. 
output array + CHECK_EQ(out_attrs->size(), 1U); + // check input + CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) + << "`create_2bit_` only supports float32 input for now"; + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32); + return true; +} + struct init_threshold_2bit { MSHADOW_XINLINE static void Map(int i, float *out, @@ -124,16 +170,16 @@ void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, using namespace mxnet_op; Stream *s = ctx.get_stream(); // First, init the memory of output to 0x00000000 - Kernel::Launch(s, outputs[0].Size(), - outputs[0].dptr()); // output array + Kernel::Launch(s, inputs[4].Size(), + inputs[4].dptr()); // compressed array // Then, init threshold Kernel::Launch(s, 1, - outputs[0].dptr(), // output array + inputs[4].dptr(), // compressed array inputs[2].dptr(), // negative threshold inputs[3].dptr()); // positive threshold // Finally, compress the data and calculate new residual Kernel::Launch(s, inputs[0].Size(), - outputs[0].dptr()+2, // output array + inputs[4].dptr()+2, // compressed array inputs[0].dptr(), // input array inputs[1].dptr(), // residual array inputs[2].dptr(), // negative threshold @@ -147,23 +193,20 @@ inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, // 1. residual array // 2. negative threshold // 3. positive threshold - CHECK_EQ(in_attrs->size(), 4U); - // 0. output array - CHECK_EQ(out_attrs->size(), 1U); - // check input + // 4. compressed array + CHECK_EQ(in_attrs->size(), 5U); CHECK(!shape_is_none(in_attrs->at(0))); CHECK(!shape_is_none(in_attrs->at(1))); - CHECK(shape_is_scalar(in_attrs->at(2))); - CHECK(shape_is_scalar(in_attrs->at(3))); CHECK_EQ(in_attrs->at(0).Size(), in_attrs->at(1).Size()); - // check output + CHECK(shape_is_scalar(in_attrs->at(2))); + CHECK(shape_is_scalar(in_attrs->at(3))); int shape = in_attrs->at(0).Size() % 16 == 0 ? 
in_attrs->at(0).Size() / 16 + 2: in_attrs->at(0).Size() / 16 + 3; - SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape{shape}); - // new residual array will re-use the memory of - // the original residual array + CHECK_EQ(in_attrs->at(4).Size(), shape) + << "The size of output array is not equal to " + << "the size of compressed array"; return true; } @@ -174,9 +217,8 @@ inline bool Quantize2BitType(const nnvm::NodeAttrs& attrs, // 1. residual array // 2. negative threshold // 3. positive threshold - CHECK_EQ(in_attrs->size(), 4U); - // 0. output array - CHECK_EQ(out_attrs->size(), 1U); + // 4. compressed array + CHECK_EQ(in_attrs->size(), 5U); // check input CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) << "`quantize_2bit_` only supports float32 input for now"; @@ -188,10 +230,8 @@ inline bool Quantize2BitType(const nnvm::NodeAttrs& attrs, CHECK_EQ((*in_attrs)[3], mshadow::kFloat32) << "the fourth input of `quantize_2bit` should be " << "a tensor with type of float"; - // check output - TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32); - // new residual array will re-use the memory of - // the original residual array + CHECK_EQ((*in_attrs)[4], mshadow::kFloat32) + << "`quantize_2bit_` only supports float32 input for now"; return true; } @@ -303,8 +343,6 @@ inline bool Dequantize2BitType(const nnvm::NodeAttrs& attrs, // 0. compressed array // 1. 
original array CHECK_EQ(in_attrs->size(), 2U); - // No output - CHECK_EQ(out_attrs->size(), 0U); // check input CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) << "`dequantize_2bit_` only supports float32 input for now"; diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index 6cdae44e0709..28968583a471 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -28,26 +28,28 @@ namespace op { NNVM_REGISTER_OP(_contrib_quantize_2bit) .describe(R"code(Quantize a input tensor using 2-bit compression with residual -array and user-specified threshold - 'pos_threshold' & 'neg_threshold'. +array and user-specified threshold. -For example, assume the input array (gradient + residual) is -[-1.0, -5.0, -4.0, 2.0, 2.5, 7.5], and the threshold is -4.0 and +4.0, -respectively. In this method, the elements >= pos_threshold will be +For example, assume the input array (gradient) is [-1.0, -5.0, -4.0], and the +residual is [-2.0, 0, 1.0], and the threshold is -4.0 and +4.0, respectively. +In this method, the elements (gradient + residual) >= pos_threshold will be compressed into a 2-bit data '01', and the elements <= neg_threshold will be compressed into a 2-bit data '10'. The other elements will be compressed into '00', which is represented as zero. -In this example, -out = quantize_2bit(array, residual, neg_threshold, pos_threshold) -will return a compressed array 'out' with 3 elements. The first element -stores the neg_threshold (-0.4) and the second element stores the pos_threshold -(+0.4), and the original array will be compressed into a single element in +In this example, invoke +quantize_2bit(array, residual, neg_threshold, pos_threshold, out), the 'out' +will be the compressed array. Note that, the out array can be generated by +invoking create_2bit(array). + +In this example, the 'out' has 3 elements. 
The first element stores the +neg_threshold (-4.0) and the second element stores the pos_threshold (+4.0), +and the original array will be compressed into a single element in the third element. In two bit compress, every 16 float data in original array will be packed into one float data in output array. - )code" ADD_FILELINE) -.set_num_inputs(4) -.set_num_outputs(1) +.set_num_inputs(5) +.set_num_outputs(0) .set_attr("FInferShape", Quantize2BitShape) .set_attr("FInferType", Quantize2BitType) .set_attr("FCompute", Quantize2BitCompute) @@ -55,8 +57,20 @@ will be packed into one float data in output array. .add_argument("gradient_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_argument("residual_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_argument("neg_shreshold", "NDArray-or-Symbol", "The negative shreshold") +.add_argument("compressed_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_argument("pos_shreshold", "NDArray-or-Symbol", "The positive shreshold"); +NNVM_REGISTER_OP(_contrib_create_2bit) +.describe(R"code(To generate a compressed array with the right shape. +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferShape", Create2BitArrayShape) +.set_attr("FInferType", Create2BitArray2BitType) +.set_attr("FCompute", Create2BitArrayCompute) +.set_attr("FGradient", ElemwiseGradUseNone{"_create_2bit"}) +.add_argument("input", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); + NNVM_REGISTER_OP(_contrib_dequantize_2bit) .describe(R"code(Dequantize a input tensor compressed by quantize_2bit. @@ -67,9 +81,7 @@ NDArray that has the same shape with the original array before compressing. Using the example as was described above. Invoke dequantize_2bit(out, array), the 'array' argument will become -[0, -4.0, -4.0, 0, 0, 4.0], where -4.0 is the negative threshold and 4.0 is -the positive threshold. - +[0, -4.0, 0], where -4.0 is the negative threshold.
)code" ADD_FILELINE) .set_num_inputs(2) .set_num_outputs(0) diff --git a/src/operator/contrib/two_bit_quantize.cu b/src/operator/contrib/two_bit_quantize.cu index d90f49da7e15..20f21a295ebd 100644 --- a/src/operator/contrib/two_bit_quantize.cu +++ b/src/operator/contrib/two_bit_quantize.cu @@ -32,5 +32,8 @@ NNVM_REGISTER_OP(_contrib_quantize_2bit) NNVM_REGISTER_OP(_contrib_dequantize_2bit) .set_attr("FCompute", Dequantize2BitCompute); +NNVM_REGISTER_OP(_contrib_create_2bit) +.set_attr("FCompute", Create2BitArrayCompute); + } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 30449c6badc1..e3ee96c2c167 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3297,17 +3297,6 @@ def test_quantization_op(): assert same(qa.asnumpy(), qa_real.asnumpy()) assert same(a_.asnumpy(), a_real.asnumpy()) -def test_quantization_2bit_op(): - array = mx.nd.array([-6, -2, 3, 1, 10, 5, -3, 2, -8, 0]) - residual = mx.nd.array([-3, 1, -1, 5, -2, 2, 3, -7, -2, -100]) - neg_threshold = mx.nd.array([-4.0]) - pos_threshold = mx.nd.array([4.0]) - - out = mx.contrib.nd.quantize_2bit(array, residual, neg_threshold, pos_threshold) - mx.contrib.nd.dequantize_2bit(out, array) - - array_real = mx.nd.array([-4., 0., 0., 4., 4., 4., 0., -4., -4., -4.]) - res_real = mx.nd.array([-5., -1., 2., 2., 4., 3., 0., -1., -6., -96.]) def test_reciprocal_op(): data_tmp = np.random.rand(3, 4) * 10 - 5 From 8cbb7f6af58006bda0f6a18261a8ad0756dba5e0 Mon Sep 17 00:00:00 2001 From: Chao Ma Date: Tue, 29 Aug 2017 10:57:44 -0700 Subject: [PATCH 014/237] Update trainer.py --- python/mxnet/gluon/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 6e8c38bb37ef..bb2cc763b5ba 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -147,7 +147,6 @@ def step(self, batch_size, 
ignore_stale_grad=False): %(param.name, str(data.context))) if self._kvstore: - print param.list_grad(); self._kvstore.push(i, param.list_grad(), priority=-i) if self._update_on_kvstore: self._kvstore.pull(i, param.list_data(), priority=-i) From 0dd187477b4d7a7ebf3017aed1d39e332cefd8da Mon Sep 17 00:00:00 2001 From: Chao Ma Date: Tue, 29 Aug 2017 10:58:09 -0700 Subject: [PATCH 015/237] Update test_operator.py --- tests/python/unittest/test_operator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index e3ee96c2c167..a33cb039c849 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3297,7 +3297,6 @@ def test_quantization_op(): assert same(qa.asnumpy(), qa_real.asnumpy()) assert same(a_.asnumpy(), a_real.asnumpy()) - def test_reciprocal_op(): data_tmp = np.random.rand(3, 4) * 10 - 5 # Avoid possible division by 0 errors From bbd21e4dba161cff1db51b08296b90ae9598088c Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Thu, 31 Aug 2017 17:59:24 +0000 Subject: [PATCH 016/237] update two bit compression --- include/mxnet/c_api.h | 2 ++ python/mxnet/gluon/trainer.py | 14 +++++++++++--- python/mxnet/kvstore.py | 7 ++++++- python/mxnet/model.py | 6 ++++-- 4 files changed, 23 insertions(+), 6 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 2289354e8a5e..253dd05f19e7 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1375,10 +1375,12 @@ MXNET_DLL int MXInitPSEnv(mx_uint num_vars, /*! * \brief Create a kvstore * \param type the type of KVStore +* \param compress whether using low-bit compression * \param out The output type of KVStore * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXKVStoreCreate(const char *type, + const char *compress, KVStoreHandle *out); /*! * \brief Delete a KVStore handle. 
diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index bb2cc763b5ba..1860625f51af 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -43,8 +43,10 @@ class Trainer(object): kvstore : str or KVStore kvstore type for multi-gpu and distributed training. See help on :any:`mxnet.kvstore.create` for more information. + compress : str + whether using low-bit compression. The argument can be 'none', '2bit', and '1bit'. """ - def __init__(self, params, optimizer, optimizer_params=None, kvstore='device'): + def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', compress='none'): if isinstance(params, (dict, ParameterDict)): params = list(params.values()) if not isinstance(params, (list, tuple)): @@ -58,13 +60,17 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device'): "First argument must be a list or dict of Parameters, " \ "got list of %s."%(type(param))) self._params.append(param) - + if (compress != 'none' and + compress != '2bit' and compress != '1bit'): + raise ValueError("The compress argument can only be 'none', " \ + "'2bit', or '1bit'.") optimizer_params = optimizer_params if optimizer_params else {} self._scale = optimizer_params.get('rescale_grad', 1.0) self._contexts = self._check_contexts() self._init_optimizer(optimizer, optimizer_params) self._kv_initialized = False self._kvstore = kvstore + self._compress = compress def _check_contexts(self): contexts = None @@ -94,7 +100,9 @@ def _init_optimizer(self, optimizer, optimizer_params): def _init_kvstore(self): arg_arrays = {param.name: param.data(self._contexts[0]) for param in self._params} - kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts), + kvstore, update_on_kvstore = _create_kvstore(self._kvstore, + self._compress, + len(self._contexts), arg_arrays) if kvstore: if 'dist' in kvstore.type: diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 
fd0091182aea..ec09cca27477 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -408,7 +408,7 @@ def _send_command_to_servers(self, head, body): check_call(_LIB.MXKVStoreSendCommmandToServers( self.handle, mx_uint(head), c_str(body))) -def create(name='local'): +def create(name='local', compress='none'): """Creates a new KVStore. For single machine training, there are two commonly used types: @@ -438,6 +438,8 @@ def create(name='local'): ---------- name : {'local', 'device', 'dist_sync', 'dist_device_sync', 'dist_async'} The type of KVStore. + compress : {'none', '2bit', '1bit'} + Whether using low-bit compression. Returns ------- kv : KVStore @@ -445,7 +447,10 @@ def create(name='local'): """ if not isinstance(name, string_types): raise TypeError('name must be a string') + if not isinstance(compress, string_types): + raise TypeError('compress must be a string') handle = KVStoreHandle() check_call(_LIB.MXKVStoreCreate(c_str(name), + c_str(compress), ctypes.byref(handle))) return KVStore(handle) diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 01b3fa50e18f..fa1a14c8d651 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -54,7 +54,7 @@ 'eval_metric', 'locals']) -def _create_kvstore(kvstore, num_device, arg_params): +def _create_kvstore(kvstore, compress, num_device, arg_params): """Create kvstore This function select and create a proper kvstore if given the kvstore type. @@ -62,6 +62,8 @@ def _create_kvstore(kvstore, num_device, arg_params): ---------- kvstore : KVStore or str The kvstore. + compress : str + Whether using low-bit compression. num_device : int The number of devices arg_params : dict of str to `NDArray`. 
@@ -78,7 +80,7 @@ def _create_kvstore(kvstore, num_device, arg_params): # no need to use kv for single device and single machine kv = None else: - kv = kvs.create(kvstore) + kv = kvs.create(kvstore, compress) if kvstore == 'local': # automatically select a proper local max_size = max(np.prod(param.shape) for param in From 72640e90523787a94be0aca7d1254ea08b227d6b Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Thu, 31 Aug 2017 18:26:28 +0000 Subject: [PATCH 017/237] update two bit compression --- include/mxnet/c_api.h | 2 -- python/mxnet/gluon/trainer.py | 1 - python/mxnet/kvstore.py | 7 +------ python/mxnet/model.py | 6 ++---- 4 files changed, 3 insertions(+), 13 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 253dd05f19e7..2289354e8a5e 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1375,12 +1375,10 @@ MXNET_DLL int MXInitPSEnv(mx_uint num_vars, /*! * \brief Create a kvstore * \param type the type of KVStore -* \param compress whether using low-bit compression * \param out The output type of KVStore * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXKVStoreCreate(const char *type, - const char *compress, KVStoreHandle *out); /*! * \brief Delete a KVStore handle. 
diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 1860625f51af..8322b07697a6 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -101,7 +101,6 @@ def _init_optimizer(self, optimizer, optimizer_params): def _init_kvstore(self): arg_arrays = {param.name: param.data(self._contexts[0]) for param in self._params} kvstore, update_on_kvstore = _create_kvstore(self._kvstore, - self._compress, len(self._contexts), arg_arrays) if kvstore: diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index ec09cca27477..fd0091182aea 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -408,7 +408,7 @@ def _send_command_to_servers(self, head, body): check_call(_LIB.MXKVStoreSendCommmandToServers( self.handle, mx_uint(head), c_str(body))) -def create(name='local', compress='none'): +def create(name='local'): """Creates a new KVStore. For single machine training, there are two commonly used types: @@ -438,8 +438,6 @@ def create(name='local', compress='none'): ---------- name : {'local', 'device', 'dist_sync', 'dist_device_sync', 'dist_async'} The type of KVStore. - compress : {'none', '2bit', '1bit'} - Whether using low-bit compression. 
Returns ------- kv : KVStore @@ -447,10 +445,7 @@ def create(name='local', compress='none'): """ if not isinstance(name, string_types): raise TypeError('name must be a string') - if not isinstance(compress, string_types): - raise TypeError('compress must be a string') handle = KVStoreHandle() check_call(_LIB.MXKVStoreCreate(c_str(name), - c_str(compress), ctypes.byref(handle))) return KVStore(handle) diff --git a/python/mxnet/model.py b/python/mxnet/model.py index fa1a14c8d651..01b3fa50e18f 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -54,7 +54,7 @@ 'eval_metric', 'locals']) -def _create_kvstore(kvstore, compress, num_device, arg_params): +def _create_kvstore(kvstore, num_device, arg_params): """Create kvstore This function select and create a proper kvstore if given the kvstore type. @@ -62,8 +62,6 @@ def _create_kvstore(kvstore, compress, num_device, arg_params): ---------- kvstore : KVStore or str The kvstore. - compress : str - Whether using low-bit compression. num_device : int The number of devices arg_params : dict of str to `NDArray`. 
@@ -80,7 +78,7 @@ def _create_kvstore(kvstore, compress, num_device, arg_params): # no need to use kv for single device and single machine kv = None else: - kv = kvs.create(kvstore, compress) + kv = kvs.create(kvstore) if kvstore == 'local': # automatically select a proper local max_size = max(np.prod(param.shape) for param in From aaafa84490333639ae9f1944422c91373353475d Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Thu, 31 Aug 2017 18:43:42 +0000 Subject: [PATCH 018/237] update two bit compression --- include/mxnet/kvstore.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index d2924ecea1b5..0d1dd3375c84 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -63,6 +63,13 @@ class KVStore { */ inline const std::string& type() { return type_; } + /** + * \brief set to use low-bit compression + */ + inline void SetCompress(const std::string& compress) { + compress_ = compress; + } + /*! * \brief Initialize a list of key-value pair to the store. * @@ -341,6 +348,11 @@ class KVStore { */ std::string type_; + /** + * \brief whether using low-bit compression + */ + std::string compress_ = "none"; + /** * \brief whether to do barrier when finalize */ From 2d85430c8bfd8d024d46bde17aa32eb82a524b44 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Thu, 31 Aug 2017 22:23:47 +0000 Subject: [PATCH 019/237] update --- include/mxnet/c_api.h | 10 ++++++++++ python/mxnet/gluon/trainer.py | 1 + python/mxnet/kvstore.py | 10 ++++++++++ src/c_api/c_api.cc | 7 +++++++ src/kvstore/comm.h | 14 +++++++++++--- src/kvstore/kvstore_local.h | 4 +++- 6 files changed, 42 insertions(+), 4 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 2289354e8a5e..5c148234a397 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1380,6 +1380,16 @@ MXNET_DLL int MXInitPSEnv(mx_uint num_vars, */ MXNET_DLL int MXKVStoreCreate(const char *type, KVStoreHandle *out); + +/*! 
+ * \brief Delete a KVStore handle. + * \param handle handle to the kvstore + * \param compress set to use low-bit compression + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXKVStoreSetCompress(KVStoreHandle handle, + const char *compress); + /*! * \brief Delete a KVStore handle. * \param handle handle to the kvstore diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 8322b07697a6..d2720886a574 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -104,6 +104,7 @@ def _init_kvstore(self): len(self._contexts), arg_arrays) if kvstore: + kvstore.set_compress(self._compress) if 'dist' in kvstore.type: update_on_kvstore = False for i, param in enumerate(self._params): diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index fd0091182aea..95d678649766 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -236,6 +236,16 @@ def pull(self, key, out=None, priority=0): self.handle, mx_uint(len(ckeys)), ckeys, cvals, ctypes.c_int(priority))) + def set_compress(self, compress='none'): + """ Set to use low-bit compression + + compress can be 'none', '2bit', or '1bit'. + """ + if not isinstance(compress, string_types): + raise TypeError('compress must be a string') + check_call(_LIB.MXKVStoreSetCompress(self.handle, + c_str(compress))) + def set_optimizer(self, optimizer): """ Registers an optimizer with the kvstore. 
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 93458d21ac5a..d70f95540419 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -635,6 +635,13 @@ int MXKVStoreCreate(const char *type, API_END(); } +int MXKVStoreSetCompress(KVStoreHandle handle, + const char *compress) { + API_BEGIN(); + static_cast(handle)->SetCompress(compress); + API_END(); +} + int MXKVStoreFree(KVStoreHandle handle) { API_BEGIN(); delete static_cast(handle); diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index ade9c95feda7..b6ea293ec850 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -42,7 +42,9 @@ class Comm { /** * \brief init key with the data shape */ - virtual void Init(int key, const TShape& shape, int dtype = mshadow::kFloat32) = 0; + virtual void Init(int key, const TShape& shape, + const std::string& compress, + int dtype = mshadow::kFloat32) = 0; /** * \brief returns src[0] + .. + src[src.size()-1] */ @@ -64,6 +66,7 @@ class Comm { protected: Context pinned_ctx_; + std::string compress_; }; /** @@ -78,7 +81,9 @@ class CommCPU : public Comm { } virtual ~CommCPU() { } - void Init(int key, const TShape& shape, int type = mshadow::kFloat32) override { + void Init(int key, const TShape& shape, + const std::string& compress, + int type = mshadow::kFloat32) override { merge_buf_[key].merged = NDArray(shape, pinned_ctx_, false, type); } @@ -227,8 +232,11 @@ class CommDevice : public Comm { virtual ~CommDevice() { } - void Init(int key, const TShape& shape, int dtype = mshadow::kFloat32) override { + void Init(int key, const TShape& shape, + const std::string& compress, + int dtype = mshadow::kFloat32) override { sorted_key_attrs_.push_back(std::make_tuple(key, shape, dtype)); + compress_ = compress; } const NDArray& Reduce(int key, const std::vector& src, diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 536a89b46e13..82d86934ea34 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -62,7 +62,9 @@ class 
KVStoreLocal : public KVStore { CHECK(local_.find(keys[i]) == local_.end()) << "duplicate init of key " << keys[i]; local_[keys[i]] = values[i].Copy(pinned_ctx_); - comm_->Init(keys[i], values[i].shape(), values[i].dtype()); + comm_->Init(keys[i], values[i].shape(), + compress_, + values[i].dtype()); } } From 5a99e6a0e71ba27e79090f353f8635d2cc295131 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Fri, 1 Sep 2017 16:54:52 +0000 Subject: [PATCH 020/237] update --- include/mxnet/c_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 5c148234a397..c3afca2f4fec 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1382,7 +1382,7 @@ MXNET_DLL int MXKVStoreCreate(const char *type, KVStoreHandle *out); /*! - * \brief Delete a KVStore handle. + * \brief Set to use low-bit compression * \param handle handle to the kvstore * \param compress set to use low-bit compression * \return 0 when success, -1 when failure happens From 861fca5d72a2b3eefa4989bd8bca81af2a82ed93 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Fri, 1 Sep 2017 18:58:03 +0000 Subject: [PATCH 021/237] update two bit compression --- src/kvstore/comm.h | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index b6ea293ec850..97e05eafbcf4 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -260,23 +260,32 @@ class CommDevice : public Comm { auto& buf = merge_buf_[key]; std::vector reduce(src.size()); - CopyFromTo(src[0], &(buf.merged), priority); - reduce[0] = buf.merged; if (buf.copy_buf.empty()) { // TODO(mli) this results in large device memory usage for huge ndarray, // such as the largest fullc in VGG. consider to do segment reduce with // NDArray.Slice or gpu direct memory access. 
for the latter, we need to // remove some ctx check, and also it reduces 20% perf - buf.copy_buf.resize(src.size()-1); - for (size_t i = 0; i < src.size()-1; ++i) { + buf.copy_buf.resize(src.size()); + buf.small_buf.resize(src.size()); + for (size_t i = 0; i < src.size(); ++i) { buf.copy_buf[i] = NDArray( buf.merged.shape(), buf.merged.ctx(), false, buf.merged.dtype()); + // allocation small buffer for compressed data + if (compress_.compare("none") != 0) { + int bits = compress_ == "2bit" ? 16 : 32; + long int small_size = buf.merged.shape().Size() % bits == 0 ? + buf.merged.shape().Size() / bits + 2 : + buf.merged.shape().Size() / bits + 3; + buf.small_buf[i] = NDArray( + TShape{small_size}, buf.merged.ctx(), false, buf.merged.dtype()); + } } } - for (size_t i = 0; i < src.size()-1; ++i) { - CopyFromTo(src[i+1], &(buf.copy_buf[i]), priority); - reduce[i+1] = buf.copy_buf[i]; + + for (size_t i = 0; i < src.size(); ++i) { + CopyFromTo(src[i], &(buf.copy_buf[i]), priority); + reduce[i] = buf.copy_buf[i]; } ElementwiseSum(reduce, &buf.merged); @@ -386,6 +395,8 @@ class CommDevice : public Comm { NDArray merged; /// \brief the gpu buffer std::vector copy_buf; + /// \brief the small buffer for compressed data + std::vector small_buf; }; std::unordered_map merge_buf_; bool inited_; From 54c6f0600df052de0a57a77fe3f1cb79ac132092 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Fri, 1 Sep 2017 19:20:59 +0000 Subject: [PATCH 022/237] update two bit compression --- src/kvstore/comm.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 97e05eafbcf4..4e9bec620be4 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -267,7 +267,8 @@ class CommDevice : public Comm { // NDArray.Slice or gpu direct memory access. 
for the latter, we need to // remove some ctx check, and also it reduces 20% perf buf.copy_buf.resize(src.size()); - buf.small_buf.resize(src.size()); + buf.small_recv_buf.resize(src.size()); + buf.small_send_buf.resize(src.size()); for (size_t i = 0; i < src.size(); ++i) { buf.copy_buf[i] = NDArray( buf.merged.shape(), buf.merged.ctx(), false, buf.merged.dtype()); @@ -277,14 +278,23 @@ class CommDevice : public Comm { long int small_size = buf.merged.shape().Size() % bits == 0 ? buf.merged.shape().Size() / bits + 2 : buf.merged.shape().Size() / bits + 3; - buf.small_buf[i] = NDArray( + buf.small_recv_buf[i] = NDArray( TShape{small_size}, buf.merged.ctx(), false, buf.merged.dtype()); + buf.small_send_buf[i] = NDArray( + TShape{small_size}, src[i].ctx(), false, buf.merged.dtype()); } } } for (size_t i = 0; i < src.size(); ++i) { + // TODO: New code: + // CompressNDArray(src[i], &(buf.small_send_buf[i])); + // CopyFromTo(buf.small_send_buf[i], &(buf.small_recv_buf[i]), priority); + // DeCompressNDArray(buf.small_recv_buf[i], &(buf.copy_buf[i])); + + // TODO: Delete this line CopyFromTo(src[i], &(buf.copy_buf[i]), priority); + reduce[i] = buf.copy_buf[i]; } @@ -395,8 +405,10 @@ class CommDevice : public Comm { NDArray merged; /// \brief the gpu buffer std::vector copy_buf; - /// \brief the small buffer for compressed data - std::vector small_buf; + /// \brief the small buffer for compressed data in sender + std::vector small_send_buf; + /// \brief the small buffer for compressed data in receiver + std::vector small_recv_buf; }; std::unordered_map merge_buf_; bool inited_; From 03e47a430371c72d49a97d057ee25817f49c9a20 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Fri, 1 Sep 2017 21:53:39 +0000 Subject: [PATCH 023/237] update two bit compression --- src/kvstore/comm.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 4e9bec620be4..24ecfebf9267 100644 --- a/src/kvstore/comm.h +++ 
b/src/kvstore/comm.h @@ -287,14 +287,15 @@ class CommDevice : public Comm { } for (size_t i = 0; i < src.size(); ++i) { - // TODO: New code: - // CompressNDArray(src[i], &(buf.small_send_buf[i])); - // CopyFromTo(buf.small_send_buf[i], &(buf.small_recv_buf[i]), priority); - // DeCompressNDArray(buf.small_recv_buf[i], &(buf.copy_buf[i])); - - // TODO: Delete this line - CopyFromTo(src[i], &(buf.copy_buf[i]), priority); - + // compress before copy + if (compress_.compare("none") != 0) { + // TODO: New code: wrapper for NDArray op + // Compress(src[i], &(buf.small_send_buf[i]), compress_, priority); + // CopyFromTo(buf.small_send_buf[i], &(buf.small_recv_buf[i]), priority); + // DeCompress(buf.small_recv_buf[i], &(buf.copy_buf[i]), compress_, priority); + } else { + CopyFromTo(src[i], &(buf.copy_buf[i]), priority); + } reduce[i] = buf.copy_buf[i]; } From fedd4b4c7b5805dadbf33d0fba93809bc3540743 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Tue, 5 Sep 2017 18:42:40 +0000 Subject: [PATCH 024/237] update two bit compression --- include/mxnet/c_api.h | 6 +++++- include/mxnet/kvstore.h | 16 +++++++++++++++- python/mxnet/gluon/trainer.py | 17 +++++++++++++++-- python/mxnet/kvstore.py | 10 +++++++--- src/c_api/c_api.cc | 8 ++++++-- src/kvstore/comm.h | 10 ++++++++++ src/kvstore/kvstore_local.h | 2 ++ 7 files changed, 60 insertions(+), 9 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index c3afca2f4fec..e64daee00f11 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1385,10 +1385,14 @@ MXNET_DLL int MXKVStoreCreate(const char *type, * \brief Set to use low-bit compression * \param handle handle to the kvstore * \param compress set to use low-bit compression + * \param pos_threshold set the positive threshold in 2bit compress + * \param neg_threshold set the negative threshold in 2bit compress * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXKVStoreSetCompress(KVStoreHandle handle, - const char *compress); + const 
char *compress, + const float pos_threshold, + const float neg_threshold); /*! * \brief Delete a KVStore handle. diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index 0d1dd3375c84..c609d417ca48 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -66,8 +66,12 @@ class KVStore { /** * \brief set to use low-bit compression */ - inline void SetCompress(const std::string& compress) { + inline void SetCompress(const std::string& compress, + const float pos_threshold, + const float neg_threshold) { compress_ = compress; + pos_threshold_ = pos_threshold; + neg_threshold_ = neg_threshold; } /*! @@ -353,6 +357,16 @@ class KVStore { */ std::string compress_ = "none"; + /** + * \brief positive threshold + */ + float pos_threshold_; + + /** + * \brief negative threshold + */ + float neg_threshold_; + /** * \brief whether to do barrier when finalize */ diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index d2720886a574..8d70d87717c1 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -45,8 +45,13 @@ class Trainer(object): :any:`mxnet.kvstore.create` for more information. compress : str whether using low-bit compression. The argument can be 'none', '2bit', and '1bit'. + pos_threshold: + positive threshold used in 2bit compression. + neg_threshold: + negative threshold used in 2bit compression. 
""" - def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', compress='none'): + def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', + compress='none', pos_threshold=0.1, neg_threshold=-0.1): if isinstance(params, (dict, ParameterDict)): params = list(params.values()) if not isinstance(params, (list, tuple)): @@ -64,6 +69,10 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', c compress != '2bit' and compress != '1bit'): raise ValueError("The compress argument can only be 'none', " \ "'2bit', or '1bit'.") + if (compress == '2bit' and + (pos_threshold <= 0 or neg_threshold >= 0)): + raise ValueError("The pos_threshold must be greater than 0, and " \ + "the neg_threshold must be less than 0.") optimizer_params = optimizer_params if optimizer_params else {} self._scale = optimizer_params.get('rescale_grad', 1.0) self._contexts = self._check_contexts() @@ -71,6 +80,8 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', c self._kv_initialized = False self._kvstore = kvstore self._compress = compress + self._pos_threshold = pos_threshold + self._neg_threshold = neg_threshold def _check_contexts(self): contexts = None @@ -104,7 +115,9 @@ def _init_kvstore(self): len(self._contexts), arg_arrays) if kvstore: - kvstore.set_compress(self._compress) + kvstore.set_compress(self._compress, + self._pos_threshold, + self._neg_threshold) if 'dist' in kvstore.type: update_on_kvstore = False for i, param in enumerate(self._params): diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 95d678649766..f5813bbd8b0d 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -23,7 +23,7 @@ import pickle from .ndarray import NDArray from .base import _LIB -from .base import check_call, c_array, c_str, string_types, mx_uint, py_str +from .base import check_call, c_array, c_str, string_types, mx_uint, mx_float, py_str from .base import NDArrayHandle, 
KVStoreHandle from . import optimizer as opt @@ -236,7 +236,9 @@ def pull(self, key, out=None, priority=0): self.handle, mx_uint(len(ckeys)), ckeys, cvals, ctypes.c_int(priority))) - def set_compress(self, compress='none'): + def set_compress(self, compress='none', + pos_threshold=0.1, + neg_threshold=-0.1): """ Set to use low-bit compression compress can be 'none', '2bit', or '1bit'. @@ -244,7 +246,9 @@ def set_compress(self, compress='none'): if not isinstance(compress, string_types): raise TypeError('compress must be a string') check_call(_LIB.MXKVStoreSetCompress(self.handle, - c_str(compress))) + c_str(compress), + mx_float(pos_threshold), + mx_float(neg_threshold))) def set_optimizer(self, optimizer): """ Registers an optimizer with the kvstore. diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index d70f95540419..1c8df0f146c6 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -636,9 +636,13 @@ int MXKVStoreCreate(const char *type, } int MXKVStoreSetCompress(KVStoreHandle handle, - const char *compress) { + const char *compress, + const float pos_threshold, + const float neg_threshold) { API_BEGIN(); - static_cast(handle)->SetCompress(compress); + static_cast(handle)->SetCompress(compress, + pos_threshold, + neg_threshold); API_END(); } diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 24ecfebf9267..20f8f49d672a 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -44,6 +44,8 @@ class Comm { */ virtual void Init(int key, const TShape& shape, const std::string& compress, + float const pos_threshold, + float const neg_threshold, int dtype = mshadow::kFloat32) = 0; /** * \brief returns src[0] + .. 
+ src[src.size()-1] @@ -67,6 +69,8 @@ class Comm { protected: Context pinned_ctx_; std::string compress_; + float pos_threshold_; + float neg_threshold_; }; /** @@ -83,6 +87,8 @@ class CommCPU : public Comm { void Init(int key, const TShape& shape, const std::string& compress, + const float pos_threshold, + const float neg_threshold, int type = mshadow::kFloat32) override { merge_buf_[key].merged = NDArray(shape, pinned_ctx_, false, type); } @@ -234,9 +240,13 @@ class CommDevice : public Comm { void Init(int key, const TShape& shape, const std::string& compress, + const float pos_threshold, + const float neg_threshold, int dtype = mshadow::kFloat32) override { sorted_key_attrs_.push_back(std::make_tuple(key, shape, dtype)); compress_ = compress; + pos_threshold_ = pos_threshold; + neg_threshold_ = neg_threshold; } const NDArray& Reduce(int key, const std::vector& src, diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 82d86934ea34..10d4448af777 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -64,6 +64,8 @@ class KVStoreLocal : public KVStore { local_[keys[i]] = values[i].Copy(pinned_ctx_); comm_->Init(keys[i], values[i].shape(), compress_, + pos_threshold_, + neg_threshold_, values[i].dtype()); } } From 13ff1bcb9d688e71ecbace026112ca495c7f1aa9 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Tue, 5 Sep 2017 20:36:47 +0000 Subject: [PATCH 025/237] update two bit compression --- src/kvstore/comm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 20f8f49d672a..2417768f8745 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -279,11 +279,23 @@ class CommDevice : public Comm { buf.copy_buf.resize(src.size()); buf.small_recv_buf.resize(src.size()); buf.small_send_buf.resize(src.size()); + buf.residual.resize(src.size()); for (size_t i = 0; i < src.size(); ++i) { buf.copy_buf[i] = NDArray( buf.merged.shape(), buf.merged.ctx(), false, 
buf.merged.dtype()); // allocation small buffer for compressed data if (compress_.compare("none") != 0) { + // Residual + buf.residual[i] = NDArray( + buf.merged.shape(), src[i].ctx(), false, buf.merged.dtype()); + // positive and negative threshold + buf.pos_thre = NDArray( + TShape{1}, src[i].ctx(), false, buf.merged.dtype()); + *(buf.pos_thre.data().dptr()) = pos_threshold_; + buf.neg_thre = NDArray( + TShape{1}, src[i].ctx(), false, buf.merged.dtype()); + *(buf.neg_thre.data().dptr()) = neg_threshold_; + // recv buffer and send buffer int bits = compress_ == "2bit" ? 16 : 32; long int small_size = buf.merged.shape().Size() % bits == 0 ? buf.merged.shape().Size() / bits + 2 : @@ -416,6 +428,12 @@ class CommDevice : public Comm { NDArray merged; /// \brief the gpu buffer std::vector copy_buf; + /// \brief the residual buffer + std::vector residual; + /// \brief the positive threshold + NDArray pos_thre; + /// \brief the negative threshold + NDArray neg_thre; /// \brief the small buffer for compressed data in sender std::vector small_send_buf; /// \brief the small buffer for compressed data in receiver From b75d7ca55d4115ffa6c7c240f12a7da37d6d117a Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Tue, 5 Sep 2017 21:00:34 +0000 Subject: [PATCH 026/237] update two bit compression --- src/kvstore/comm.h | 23 +++++++++++------------ src/kvstore/kvstore_local.h | 7 ++----- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 2417768f8745..17e4e63a27aa 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -43,9 +43,6 @@ class Comm { * \brief init key with the data shape */ virtual void Init(int key, const TShape& shape, - const std::string& compress, - float const pos_threshold, - float const neg_threshold, int dtype = mshadow::kFloat32) = 0; /** * \brief returns src[0] + .. 
+ src[src.size()-1] @@ -66,6 +63,17 @@ class Comm { return pinned_ctx_; } + /** + * \brief set to use low-bit compression + */ + void SetCompress(const std::string& compress, + float const pos_threshold, + float const neg_threshold) { + compress_ = compress; + pos_threshold_ = pos_threshold; + neg_threshold_ = neg_threshold; + } + protected: Context pinned_ctx_; std::string compress_; @@ -86,9 +94,6 @@ class CommCPU : public Comm { virtual ~CommCPU() { } void Init(int key, const TShape& shape, - const std::string& compress, - const float pos_threshold, - const float neg_threshold, int type = mshadow::kFloat32) override { merge_buf_[key].merged = NDArray(shape, pinned_ctx_, false, type); } @@ -239,14 +244,8 @@ class CommDevice : public Comm { virtual ~CommDevice() { } void Init(int key, const TShape& shape, - const std::string& compress, - const float pos_threshold, - const float neg_threshold, int dtype = mshadow::kFloat32) override { sorted_key_attrs_.push_back(std::make_tuple(key, shape, dtype)); - compress_ = compress; - pos_threshold_ = pos_threshold; - neg_threshold_ = neg_threshold; } const NDArray& Reduce(int key, const std::vector& src, diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 10d4448af777..5c1476b6d11b 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -62,11 +62,8 @@ class KVStoreLocal : public KVStore { CHECK(local_.find(keys[i]) == local_.end()) << "duplicate init of key " << keys[i]; local_[keys[i]] = values[i].Copy(pinned_ctx_); - comm_->Init(keys[i], values[i].shape(), - compress_, - pos_threshold_, - neg_threshold_, - values[i].dtype()); + comm_->Init(keys[i], values[i].shape(), values[i].dtype()); + comm_->SetCompress(compress_, pos_threshold_, neg_threshold_); } } From b84b7622a82b00cec66b70727f1d30bef716905f Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Tue, 5 Sep 2017 21:11:24 +0000 Subject: [PATCH 027/237] update two bit compression --- src/kvstore/comm.h | 33 
+++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 17e4e63a27aa..0088a34f87c7 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -309,12 +309,33 @@ class CommDevice : public Comm { for (size_t i = 0; i < src.size(); ++i) { // compress before copy - if (compress_.compare("none") != 0) { - // TODO: New code: wrapper for NDArray op - // Compress(src[i], &(buf.small_send_buf[i]), compress_, priority); - // CopyFromTo(buf.small_send_buf[i], &(buf.small_recv_buf[i]), priority); - // DeCompress(buf.small_recv_buf[i], &(buf.copy_buf[i]), compress_, priority); - } else { + if (compress_.compare("2bit") == 0) { + // TODO: New code: wrapper for NDArray quantize_2bit op + /* + Compress2Bit(src[i], buf.residual[i], + buf.pos_thre, buf.neg_thre, + &(buf.small_send_buf[i]), priority); + CopyFromTo(buf.small_send_buf[i], + &(buf.small_recv_buf[i]), + priority); + DeCompress2Bit(buf.small_recv_buf[i], + &(buf.copy_buf[i]), + priority); + */ + } else if (compress_.compare("1bit") == 0) { + // TODO: New code: wrapper for NDArray quantize_1bit op + /* + Compress1Bit(src[i], buf.residual[i], + &(buf.small_send_buf[i]), + priority); + CopyFromTo(buf.small_send_buf[i], + &(buf.small_recv_buf[i]), + priority); + DeCompress1Bit(buf.small_recv_buf[i], + &(buf.copy_buf[i]), + priority); + */ + } else { // Do not compress CopyFromTo(src[i], &(buf.copy_buf[i]), priority); } reduce[i] = buf.copy_buf[i]; From 260b606942c546e6ce7422b6866899bc72c280de Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Tue, 5 Sep 2017 22:48:34 +0000 Subject: [PATCH 028/237] update two bit compression --- src/kvstore/comm.h | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 0088a34f87c7..0cc3fc16b486 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -279,6 +279,9 @@ class CommDevice : public Comm { 
buf.small_recv_buf.resize(src.size()); buf.small_send_buf.resize(src.size()); buf.residual.resize(src.size()); + pos_thre.resize(src.size()); + neg_thre.resize(src.size()); + for (size_t i = 0; i < src.size(); ++i) { buf.copy_buf[i] = NDArray( buf.merged.shape(), buf.merged.ctx(), false, buf.merged.dtype()); @@ -287,13 +290,7 @@ class CommDevice : public Comm { // Residual buf.residual[i] = NDArray( buf.merged.shape(), src[i].ctx(), false, buf.merged.dtype()); - // positive and negative threshold - buf.pos_thre = NDArray( - TShape{1}, src[i].ctx(), false, buf.merged.dtype()); - *(buf.pos_thre.data().dptr()) = pos_threshold_; - buf.neg_thre = NDArray( - TShape{1}, src[i].ctx(), false, buf.merged.dtype()); - *(buf.neg_thre.data().dptr()) = neg_threshold_; + // TODO set residual to zero // recv buffer and send buffer int bits = compress_ == "2bit" ? 16 : 32; long int small_size = buf.merged.shape().Size() % bits == 0 ? @@ -303,6 +300,15 @@ class CommDevice : public Comm { TShape{small_size}, buf.merged.ctx(), false, buf.merged.dtype()); buf.small_send_buf[i] = NDArray( TShape{small_size}, src[i].ctx(), false, buf.merged.dtype()); + // The positive and negative threshold + if (compress_.compare("2bit") == 0) { + pos_thre[i] = NDArray( + TShape{1}, src[i].ctx(), false, buf.merged.dtype()); + // TODO set pos_thre to pos_threshold_ + neg_thre[i] = NDArray( + TShape{1}, src[i].ctx(), false, buf.merged.dtype()); + // TODO set neg_thre to neg_threshold_ + } } } } @@ -313,7 +319,7 @@ class CommDevice : public Comm { // TODO: New code: wrapper for NDArray quantize_2bit op /* Compress2Bit(src[i], buf.residual[i], - buf.pos_thre, buf.neg_thre, + pos_thre[i], neg_thre[i], &(buf.small_send_buf[i]), priority); CopyFromTo(buf.small_send_buf[i], &(buf.small_recv_buf[i]), @@ -322,6 +328,7 @@ class CommDevice : public Comm { &(buf.copy_buf[i]), priority); */ + CopyFromTo(src[i], &(buf.copy_buf[i]), priority); } else if (compress_.compare("1bit") == 0) { // TODO: New code: wrapper for 
NDArray quantize_1bit op /* @@ -335,6 +342,7 @@ class CommDevice : public Comm { &(buf.copy_buf[i]), priority); */ + CopyFromTo(src[i], &(buf.copy_buf[i]), priority); } else { // Do not compress CopyFromTo(src[i], &(buf.copy_buf[i]), priority); } @@ -450,16 +458,16 @@ class CommDevice : public Comm { std::vector copy_buf; /// \brief the residual buffer std::vector residual; - /// \brief the positive threshold - NDArray pos_thre; - /// \brief the negative threshold - NDArray neg_thre; /// \brief the small buffer for compressed data in sender std::vector small_send_buf; /// \brief the small buffer for compressed data in receiver std::vector small_recv_buf; }; std::unordered_map merge_buf_; + + // \brief the positive and negative threshold + std::vector pos_thre; + std::vector neg_thre; bool inited_; }; From 7d78e3a291ed030343b47ed2b29e6c0d1980c73c Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Tue, 5 Sep 2017 23:32:22 +0000 Subject: [PATCH 029/237] update two bit compression --- src/kvstore/comm.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 0cc3fc16b486..7810b466f65c 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -290,7 +290,7 @@ class CommDevice : public Comm { // Residual buf.residual[i] = NDArray( buf.merged.shape(), src[i].ctx(), false, buf.merged.dtype()); - // TODO set residual to zero + buf.residual[i] = 0; // recv buffer and send buffer int bits = compress_ == "2bit" ? 16 : 32; long int small_size = buf.merged.shape().Size() % bits == 0 ? 
@@ -304,10 +304,10 @@ class CommDevice : public Comm { if (compress_.compare("2bit") == 0) { pos_thre[i] = NDArray( TShape{1}, src[i].ctx(), false, buf.merged.dtype()); - // TODO set pos_thre to pos_threshold_ + pos_thre[i] = pos_threshold_; neg_thre[i] = NDArray( TShape{1}, src[i].ctx(), false, buf.merged.dtype()); - // TODO set neg_thre to neg_threshold_ + neg_thre[i] = neg_threshold_; } } } @@ -328,7 +328,6 @@ class CommDevice : public Comm { &(buf.copy_buf[i]), priority); */ - CopyFromTo(src[i], &(buf.copy_buf[i]), priority); } else if (compress_.compare("1bit") == 0) { // TODO: New code: wrapper for NDArray quantize_1bit op /* @@ -342,7 +341,6 @@ class CommDevice : public Comm { &(buf.copy_buf[i]), priority); */ - CopyFromTo(src[i], &(buf.copy_buf[i]), priority); } else { // Do not compress CopyFromTo(src[i], &(buf.copy_buf[i]), priority); } From 2a90daec6305ced709b1f319f7a5f607732ede97 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Wed, 6 Sep 2017 18:24:43 +0000 Subject: [PATCH 030/237] update two bit compression --- src/kvstore/kvstore_local.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 8d7a6a086e24..1e2cb52b3b5e 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -66,13 +66,6 @@ class KVStoreLocal : public KVStore { void Init(const std::vector& keys, const std::vector& values) override { - for (size_t i = 0; i < keys.size(); ++i) { - CHECK(local_.find(keys[i]) == local_.end()) - << "duplicate init of key " << keys[i]; - local_[keys[i]] = values[i].Copy(pinned_ctx_); - comm_->Init(keys[i], values[i].shape(), values[i].dtype()); - comm_->SetCompress(compress_, pos_threshold_, neg_threshold_); - } SetKeyType(kIntKey); Init_(keys, values); } @@ -151,6 +144,7 @@ class KVStoreLocal : public KVStore { local_[keys[i]] = values[i].Copy(pinned_ctx_); comm_->Init(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } + 
comm_->SetCompress(compress_, pos_threshold_, neg_threshold_); } void Push_(const std::vector& keys, From baba1d89fd2873b6c864a2d2466d5d896d7c3b9f Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Wed, 6 Sep 2017 21:40:18 +0000 Subject: [PATCH 031/237] update two bit compression --- include/mxnet/ndarray.h | 6 ++ src/common/utils.cc | 11 +++ src/common/utils.cu | 11 +++ src/common/utils.h | 6 ++ src/kvstore/comm.h | 4 + src/ndarray/ndarray.cc | 91 +++++++++++++++++++++ src/operator/contrib/two_bit_quantize-inl.h | 41 ++++++---- 7 files changed, 154 insertions(+), 16 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 754bc28e7bed..692f56d2ca19 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -905,6 +905,12 @@ size_t num_aux_data(NDArrayStorageType stype); */ void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0); +void Quantize(const NDArray &from, NDArray *to, NDArray *residual, + const NDArray &pos_threshold, const NDArray &neg_threshold, + std::string& compress, int priority); + +void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int priority); + /*! * \brief Perform elementwise sum over each data from source, store result into out. 
* \param source the ndarray we want to sum diff --git a/src/common/utils.cc b/src/common/utils.cc index 125e4e5dc7d7..ecf6e0d580fb 100644 --- a/src/common/utils.cc +++ b/src/common/utils.cc @@ -24,6 +24,7 @@ #include "./utils.h" #include "../operator/tensor/cast_storage-inl.h" +#include "../operator/contrib/two_bit_quantize-inl.h" namespace mxnet { namespace common { @@ -35,5 +36,15 @@ void CastStorageDispatch(const OpContext& ctx, mxnet::op::CastStorageComputeImpl(ctx, input, output); } +template<> +void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { + mxnet::op::Dequantize2BitImpl(s,inputs); +} + +template<> +void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { + mxnet::op::Quantize2BitImpl(s,inputs); +} + } // namespace common } // namespace mxnet diff --git a/src/common/utils.cu b/src/common/utils.cu index 093480a98907..7620bd18984d 100644 --- a/src/common/utils.cu +++ b/src/common/utils.cu @@ -24,6 +24,7 @@ #include "./utils.h" #include "../operator/tensor/cast_storage-inl.h" +#include "../operator/contrib/two_bit_quantize-inl.h" namespace mxnet { namespace common { @@ -35,5 +36,15 @@ void CastStorageDispatch(const OpContext& ctx, mxnet::op::CastStorageComputeImpl(ctx, input, output); } +template<> +void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { + mxnet::op::Dequantize2BitImpl(s,inputs); +} + +template<> +void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { + mxnet::op::Quantize2BitImpl(s,inputs); +} + } // namespace common } // namespace mxnet diff --git a/src/common/utils.h b/src/common/utils.h index 92631a9b5c34..15a48d5f4600 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -48,6 +48,12 @@ namespace common { template void CastStorageDispatch(const OpContext& ctx, const NDArray& input, const NDArray& output); +template +void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs); + +template +void Quantize2BitDispatch(mshadow::Stream* 
s, const std::vector& inputs); + /* * \brief setup default-storage tblobs from source NDArrays. If any source NDArray has non-default * storage, it creates a temp NDArray with default storage and uses the temp tblob. The diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 5211d6f75e05..b078ad06f0c3 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -569,6 +569,10 @@ class CommDevice : public Comm { &(buf.copy_buf[i]), priority); */ + Quantize(src[i], &(buf.small_send_buf[i]), &(buf.residual[i]), + pos_thre[i], neg_thre[i], compress_, priority); + CopyFromTo(buf.small_send_buf[i], &(buf.small_recv_buf[i]), priority); + Dequantize(buf.small_recv_buf[i], &(buf.copy_buf[i]), compress_, priority); } else if (compress_.compare("1bit") == 0) { // TODO: New code: wrapper for NDArray quantize_1bit op /* diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 7b79d1051135..a618a3ec9bec 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -547,6 +547,97 @@ void CopyFromTo(const NDArray &from, NDArray *to, int priority) { } } +void Quantize(const NDArray &from, NDArray *to, NDArray *residual, + const NDArray &pos_threshold, const NDArray &neg_threshold, + std::string& compress, int priority) { + CHECK(from.shape().ndim() != 0) + << "source operands have zero dimension shape"; + // important: callback must always capture by value + NDArray ret = *to; + int a = from.ctx().dev_mask(); + int b = to->ctx().dev_mask(); + std::vector const_vars; + const_vars.push_back(from.var()); + + std::vector inputs(5); + inputs[0] = from.data(); + inputs[1] = residual->data(); + inputs[2] = neg_threshold.data(); + inputs[3] = pos_threshold.data(); + inputs[4] = to->data(); + + if (a == cpu::kDevMask && b == cpu::kDevMask) { + if (compress == "2bit") { + Engine::Get()->PushSync([inputs](RunContext ctx) { + common::Quantize2BitDispatch(ctx.get_stream(), inputs); + }, from.ctx(), const_vars, {ret.var()}, + FnProperty::kNormal, priority, 
PROFILER_MESSAGE("DequantizeCPU")); + } else { + LOG(FATAL) << "Unsupported dequantization"; + } + } else { +#if MXNET_USE_CUDA + if (a == gpu::kDevMask && b == gpu::kDevMask) { + if (compress == "2bit") { + Engine::Get()->PushSync([inputs](RunContext ctx) { + common::Quantize2BitDispatch(ctx.get_stream(), inputs); + }, from.ctx(), const_vars, {ret.var()}, + FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); + } else { + LOG(FATAL) << "Unsupported dequantization"; + } + } else { + LOG(FATAL) << "unknown device mask"; + } +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } +} + +void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int priority) { + CHECK(from.shape().ndim() != 0) + << "source operands have zero dimension shape"; + // important: callback must always capture by value + NDArray ret = *to; + int a = from.ctx().dev_mask(); + int b = to->ctx().dev_mask(); + std::vector const_vars; + const_vars.push_back(from.var()); + + std::vector inputs(2); + inputs[0] = from.data(); + inputs[1] = to->data(); + + if (a == cpu::kDevMask && b == cpu::kDevMask) { + if (compress == "2bit") { + Engine::Get()->PushSync([inputs](RunContext ctx) { + common::Dequantize2BitDispatch(ctx.get_stream(), inputs); + }, from.ctx(), const_vars, {ret.var()}, + FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); + } else { + LOG(FATAL) << "Unsupported dequantization"; + } + } else { +#if MXNET_USE_CUDA + if (a == gpu::kDevMask && b == gpu::kDevMask) { + if (compress == "2bit") { + Engine::Get()->PushSync([inputs](RunContext ctx) { + common::Dequantize2BitDispatch(ctx.get_stream(), inputs); + }, from.ctx(), const_vars, {ret.var()}, + FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); + } else { + LOG(FATAL) << "Unsupported dequantization"; + } + } else { + LOG(FATAL) << "unknown device mask"; + } +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } +} + void ElementwiseSum(const std::vector &source, 
NDArray *out, int priority) { std::vector const_vars; const_vars.reserve(source.size()); diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index e503a3e21373..99cc6850f5fc 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -160,15 +160,9 @@ struct quantize_2bit { }; template -void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - // For now, this method can only compress the float data +void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs) { using namespace mshadow; using namespace mxnet_op; - Stream *s = ctx.get_stream(); // First, init the memory of output to 0x00000000 Kernel::Launch(s, inputs[4].Size(), inputs[4].dptr()); // compressed array @@ -186,6 +180,17 @@ void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, inputs[3].dptr()); // positive threshold } +template +void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + // For now, this method can only compress the float data + mshadow::Stream *s = ctx.get_stream(); + Quantize2BitImpl(s, inputs); +} + inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { @@ -303,22 +308,26 @@ struct dequantize_2bit { }; template -void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - using namespace mshadow; - using namespace mxnet_op; - Stream *s = ctx.get_stream(); +void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs) { + //using namespace mshadow; // For now, this method can only decompress the float data - Kernel::Launch(s, inputs[1].Size(), // original size + mxnet_op::Kernel::Launch(s, 
inputs[1].Size(), // original size inputs[1].dptr(), // out array inputs[0].dptr()+2, // compressed array inputs[0].dptr(), // negative threshold inputs[0].dptr()+1); // positve threshold } +template +void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + mshadow::Stream *s = ctx.get_stream(); + Dequantize2BitImpl(s, inputs); +} + inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { From aac529245d234c2cd4ae2f6abc70cd784dd5058a Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Wed, 6 Sep 2017 23:03:43 +0000 Subject: [PATCH 032/237] update two bit compression --- src/common/utils.cc | 11 ----------- src/common/utils.cu | 11 ----------- src/common/utils.h | 6 ------ src/ndarray/ndarray.cc | 9 +++++---- src/ndarray/ndarray_function.cc | 11 +++++++++++ src/ndarray/ndarray_function.cu | 12 ++++++++++++ src/ndarray/ndarray_function.h | 6 ++++++ 7 files changed, 34 insertions(+), 32 deletions(-) diff --git a/src/common/utils.cc b/src/common/utils.cc index ecf6e0d580fb..125e4e5dc7d7 100644 --- a/src/common/utils.cc +++ b/src/common/utils.cc @@ -24,7 +24,6 @@ #include "./utils.h" #include "../operator/tensor/cast_storage-inl.h" -#include "../operator/contrib/two_bit_quantize-inl.h" namespace mxnet { namespace common { @@ -36,15 +35,5 @@ void CastStorageDispatch(const OpContext& ctx, mxnet::op::CastStorageComputeImpl(ctx, input, output); } -template<> -void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { - mxnet::op::Dequantize2BitImpl(s,inputs); -} - -template<> -void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { - mxnet::op::Quantize2BitImpl(s,inputs); -} - } // namespace common } // namespace mxnet diff --git a/src/common/utils.cu b/src/common/utils.cu index 7620bd18984d..093480a98907 100644 --- a/src/common/utils.cu +++ b/src/common/utils.cu @@ -24,7 +24,6 @@ 
#include "./utils.h" #include "../operator/tensor/cast_storage-inl.h" -#include "../operator/contrib/two_bit_quantize-inl.h" namespace mxnet { namespace common { @@ -36,15 +35,5 @@ void CastStorageDispatch(const OpContext& ctx, mxnet::op::CastStorageComputeImpl(ctx, input, output); } -template<> -void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { - mxnet::op::Dequantize2BitImpl(s,inputs); -} - -template<> -void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { - mxnet::op::Quantize2BitImpl(s,inputs); -} - } // namespace common } // namespace mxnet diff --git a/src/common/utils.h b/src/common/utils.h index 15a48d5f4600..92631a9b5c34 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -48,12 +48,6 @@ namespace common { template void CastStorageDispatch(const OpContext& ctx, const NDArray& input, const NDArray& output); -template -void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs); - -template -void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs); - /* * \brief setup default-storage tblobs from source NDArrays. If any source NDArray has non-default * storage, it creates a temp NDArray with default storage and uses the temp tblob. 
The diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index a618a3ec9bec..2e5fd2159d9c 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -34,6 +34,7 @@ #include "../operator/tensor/matrix_op-inl.h" #include "../operator/tensor/init_op.h" #include "./autograd.h" +#include "./ndarray_function.h" #if MXNET_USE_OPENCV #include @@ -569,7 +570,7 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, if (a == cpu::kDevMask && b == cpu::kDevMask) { if (compress == "2bit") { Engine::Get()->PushSync([inputs](RunContext ctx) { - common::Quantize2BitDispatch(ctx.get_stream(), inputs); + mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs); }, from.ctx(), const_vars, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { @@ -580,7 +581,7 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, if (a == gpu::kDevMask && b == gpu::kDevMask) { if (compress == "2bit") { Engine::Get()->PushSync([inputs](RunContext ctx) { - common::Quantize2BitDispatch(ctx.get_stream(), inputs); + mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs); }, from.ctx(), const_vars, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); } else { @@ -612,7 +613,7 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri if (a == cpu::kDevMask && b == cpu::kDevMask) { if (compress == "2bit") { Engine::Get()->PushSync([inputs](RunContext ctx) { - common::Dequantize2BitDispatch(ctx.get_stream(), inputs); + mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); }, from.ctx(), const_vars, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { @@ -623,7 +624,7 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri if (a == gpu::kDevMask && b == gpu::kDevMask) { if (compress == "2bit") { Engine::Get()->PushSync([inputs](RunContext ctx) { - 
common::Dequantize2BitDispatch(ctx.get_stream(), inputs); + mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); }, from.ctx(), const_vars, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); } else { diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index 5cea7942efa6..9c2f93cc796c 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -26,6 +26,7 @@ #include "./ndarray_function.h" #include "./ndarray_function-inl.h" #include "../common/utils.h" +#include "../operator/contrib/two_bit_quantize-inl.h" namespace mxnet { namespace ndarray { @@ -178,5 +179,15 @@ void ElementwiseSum(mshadow::Stream* s, } } +template<> +void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { + mxnet::op::Dequantize2BitImpl(s,inputs); +} + +template<> +void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { + mxnet::op::Quantize2BitImpl(s,inputs); +} + } // namespace ndarray } // namespace mxnet diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index 30d532673cff..48bb3458f2ef 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -26,6 +26,7 @@ #include #include "./ndarray_function.h" #include "./ndarray_function-inl.h" +#include "../operator/contrib/two_bit_quantize-inl.h" namespace mxnet { namespace ndarray { @@ -88,5 +89,16 @@ void Copy(const TBlob &from, TBlob *to, s->stream_); } } + +template<> +void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { + mxnet::op::Dequantize2BitImpl(s,inputs); +} + +template<> +void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { + mxnet::op::Quantize2BitImpl(s,inputs); +} + } // namespace ndarray } // namespace mxnet diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index 65c59185f691..f0a5e7ecdd3b 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ 
-164,6 +164,12 @@ void Copy(const TBlob &from, TBlob *to, Context from_ctx, Context to_ctx, RunContext ctx); +template +void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs); + +template +void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs); + template void ElementwiseSum(const std::vector source, TBlob *out, From b63673ab47df7fe2de61cef5e0a1ef9fd43759d8 Mon Sep 17 00:00:00 2001 From: Chao Ma Date: Mon, 11 Sep 2017 13:13:44 -0700 Subject: [PATCH 033/237] Update comm.h --- src/kvstore/comm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index b078ad06f0c3..fec91757dfc0 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -93,7 +93,7 @@ class Comm { protected: Context pinned_ctx_; - std::string compress_; + std::string compress_ = "none"; float pos_threshold_; float neg_threshold_; }; From 3d3ac921b7350ddd5e4237ad76ea43965cdfb4ae Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Thu, 14 Sep 2017 22:04:26 +0000 Subject: [PATCH 034/237] add original size in comrpessed array --- src/operator/contrib/two_bit_quantize-inl.h | 22 ++++++++++++--------- src/operator/contrib/two_bit_quantize.cc | 9 +++++---- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 99cc6850f5fc..d98d0c040571 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -67,8 +67,8 @@ inline bool Create2BitArrayShape(const nnvm::NodeAttrs& attrs, CHECK(!shape_is_none(in_attrs->at(0))); // output int shape = in_attrs->at(0).Size() % 16 == 0 ? 
- in_attrs->at(0).Size() / 16 + 2: - in_attrs->at(0).Size() / 16 + 3; + in_attrs->at(0).Size() / 16 + 3: + in_attrs->at(0).Size() / 16 + 4; SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape{shape}); return true; } @@ -91,10 +91,13 @@ struct init_threshold_2bit { MSHADOW_XINLINE static void Map(int i, float *out, const float *neg_threshold, - const float *pos_threshold) { + const float *pos_threshold, + int size) { // The first two elments in output is threshold + // The third element is the original size of the array out[0] = *neg_threshold; out[1] = *pos_threshold; + out[2] = (float)size; } }; @@ -166,14 +169,15 @@ void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs) // First, init the memory of output to 0x00000000 Kernel::Launch(s, inputs[4].Size(), inputs[4].dptr()); // compressed array - // Then, init threshold + // Then, init threshold and original size Kernel::Launch(s, 1, inputs[4].dptr(), // compressed array inputs[2].dptr(), // negative threshold - inputs[3].dptr()); // positive threshold + inputs[3].dptr(), // positive threshold + inputs[0].Size()); // original size // Finally, compress the data and calculate new residual Kernel::Launch(s, inputs[0].Size(), - inputs[4].dptr()+2, // compressed array + inputs[4].dptr()+3, // compressed array inputs[0].dptr(), // input array inputs[1].dptr(), // residual array inputs[2].dptr(), // negative threshold @@ -207,8 +211,8 @@ inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, CHECK(shape_is_scalar(in_attrs->at(2))); CHECK(shape_is_scalar(in_attrs->at(3))); int shape = in_attrs->at(0).Size() % 16 == 0 ? 
- in_attrs->at(0).Size() / 16 + 2: - in_attrs->at(0).Size() / 16 + 3; + in_attrs->at(0).Size() / 16 + 3: + in_attrs->at(0).Size() / 16 + 4; CHECK_EQ(in_attrs->at(4).Size(), shape) << "The size of output array is not equal to " << "the size of compressed array"; @@ -313,7 +317,7 @@ void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& input // For now, this method can only decompress the float data mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size inputs[1].dptr(), // out array - inputs[0].dptr()+2, // compressed array + inputs[0].dptr()+3, // compressed array inputs[0].dptr(), // negative threshold inputs[0].dptr()+1); // positve threshold } diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index 28968583a471..59136e2ef9c6 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -42,10 +42,11 @@ quantize_2bit(array, residual, neg_threshold, pos_threshold, out), the 'out' will be the compressed array. Note that, the out array can be generated by invoking create_2bit(array). -In this example, the 'out' has 3 elements. The first element stores the -neg_threshold (-0.4) and the second element stores the pos_threshold (+0.4), -and the original array will be compressed into a single element in -the third element. In two bit compress, every 16 float data in original array +In this example, the 'out' has 4 elements. The first element stores the +neg_threshold (-0.4) and the second element stores the pos_threshold (+0.4), the +third element stores the original size of the uncompressed array, and the +original array will be compressed into a single element in the last element. +In two bit compress, every 16 float data in original array will be packed into one float data in output array. 
)code" ADD_FILELINE) .set_num_inputs(5) From f7972712b0127b7ec502ce16852701ab15effd69 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Thu, 14 Sep 2017 22:15:35 +0000 Subject: [PATCH 035/237] update comm.h --- src/kvstore/comm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index fec91757dfc0..5c9da4e470af 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -535,8 +535,8 @@ class CommDevice : public Comm { // recv buffer and send buffer int bits = compress_ == "2bit" ? 16 : 32; long int small_size = buf.merged.shape().Size() % bits == 0 ? - buf.merged.shape().Size() / bits + 2 : - buf.merged.shape().Size() / bits + 3; + buf.merged.shape().Size() / bits + 3 : + buf.merged.shape().Size() / bits + 4; buf.small_recv_buf[i] = NDArray( TShape{small_size}, buf.merged.ctx(), false, buf.merged.dtype()); buf.small_send_buf[i] = NDArray( From 580746982d1965fe8cc535b6eea6c790dc45c74f Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Fri, 15 Sep 2017 17:35:13 +0000 Subject: [PATCH 036/237] update distributed training --- src/kvstore/kvstore_dist.h | 88 +++++++++++++++++++++++++++++++++++--- 1 file changed, 82 insertions(+), 6 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 6ce6b5adaf86..20af388180cd 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -31,6 +31,7 @@ #include "mxnet/engine.h" #include "ps/ps.h" #include "./kvstore_dist_server.h" +#include "../ndarray/ndarray_function.h" #if MKL_EXPERIMENTAL == 1 #include #include "../operator/mkl/mkl_memory-inl.h" @@ -269,30 +270,96 @@ class KVStoreDist : public KVStoreLocal { NDArray merged = do_merge ? comm_->Reduce(key, vals, priority) : vals[0]; auto& send_buf = comm_buf_[key]; + auto& small_buf = comm_small_buf_[key]; + auto& res_buf = residual_[key]; + + // Init the small buffer and residual_ buffer for quantize + if (small_buf.is_none() && compress_ != "none") { + int bits = compress_ == "2bit" ? 
16 : 32; + long int small_size = merged.shape().Size() % bits == 0 ? + merged.shape().Size() / bits + 3 : + merged.shape().Size() / bits + 4; + // small buffer for quantize + small_buf = NDArray(TShape{small_size}, + merged.ctx(), + false, + merged.dtype()); + // residual buffer for quantize + res_buf = NDArray(merged.shape(), + merged.ctx(), + false, + merged.dtype()); + } + // Init positive and negative threshold + if (pos_thre_.is_none() && compress_ != "none") { + // positive threshold + pos_thre_ = NDArray(TShape{1}, merged.ctx(), + false, merged.dtype()); + pos_thre_ = pos_threshold_; + // negative threshold + neg_thre_ = NDArray(TShape{1}, merged.ctx(), + false, merged.dtype()); + neg_thre_ = neg_threshold_; + } + + // Compress + if (compress_ == "2bit") { + Quantize(merged, &small_buf, &res_buf, + pos_thre_, neg_thre_, + compress_, + priority); + } + const auto storage_type = merged.storage_type(); if (merged.ctx().dev_mask() == cpu::kDevMask) { // make sure the previous push/pull is completed send_buf.WaitToWrite(); - send_buf = merged; // avoid memory copy + if (compress_ == "none") { + send_buf = merged; // avoid memory copy + } else { + send_buf = small_buf; // avoid memory copy + } } else { if (send_buf.is_none()) { if (storage_type == kDefaultStorage) { - send_buf = NDArray(merged.shape(), pinned_ctx_, true, merged.dtype()); + if (compress_ == "none") { + send_buf = NDArray(merged.shape(), + pinned_ctx_, true, merged.dtype()); + } else { + send_buf = NDArray(small_buf.shape(), + pinned_ctx_, true, small_buf.dtype()); + } } else { - send_buf = NDArray(storage_type, merged.shape(), pinned_ctx_, true, merged.dtype()); + if (compress_ == "none") { + send_buf = NDArray(storage_type, merged.shape(), + pinned_ctx_, true, merged.dtype()); + } else { + send_buf = NDArray(storage_type, small_buf.shape(), + pinned_ctx_, true, small_buf.dtype()); + } } } - CopyFromTo(merged, &send_buf); + if (compress_ == "none") { + CopyFromTo(merged, &send_buf); + } else { + 
CopyFromTo(small_buf, &send_buf); + } } // push to servers if (storage_type == kDefaultStorage) { auto push_to_servers = - [this, key, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { + [this, key, send_buf, merged](RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys - size_t size = send_buf.shape().Size(); + size_t size = 0; + if (compress_ == "none") { + size = send_buf.shape().Size(); + } else { + size = merged.shape().Size(); + } PSKV& pskv = EncodeKey(key, size); + #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(send_buf.data()); #endif @@ -539,6 +606,15 @@ class KVStoreDist : public KVStoreLocal { size_t bigarray_bound_; /// \brief send & recver buffer std::unordered_map comm_buf_; + + /// \brief small buffer for quantize + std::unordered_map comm_small_buf_; + /// \brief residual buffer for quantize + std::unordered_map residual_; + /// \brief threshold for quantize + NDArray pos_thre_; + NDArray neg_thre_; + bool log_verbose_; }; From 7dbce8bb0011e4fa09b1dfeef91c3d265fc6b49d Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Fri, 15 Sep 2017 20:21:22 +0000 Subject: [PATCH 037/237] update distributed training --- src/kvstore/kvstore_dist.h | 1 + src/kvstore/kvstore_dist_server.h | 39 ++++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 20af388180cd..422cbbfdd013 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -120,6 +120,7 @@ class KVStoreDist : public KVStoreLocal { if (IsServerNode()) { server_ = new KVStoreDistServer(); server_->set_controller(controller); + server_->set_compress(compress_); } ps::StartAsync("mxnet_server\0"); diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 88bdcab69e16..35eae25823a4 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -35,6 +35,7 @@ #include "mxnet/kvstore.h" #include 
"../operator/tensor/elemwise_binary_op.h" #include "../operator/tensor/init_op.h" +#include "../ndarray/ndarray_function.h" namespace mxnet { namespace kvstore { @@ -128,6 +129,10 @@ class KVStoreDistServer { controller_ = controller; } + void set_compress(const std::string& compress) { + compress_ = compress; + } + void set_updater(const KVStore::Updater& updater) { CHECK(updater); updater_ = updater; @@ -380,10 +385,24 @@ class KVStoreDistServer { TBlob recv_blob((real_t*)req_data.vals.data(), // NOLINT(*) dshape, cpu::kDevMask); NDArray recved = NDArray(recv_blob, 0); + + NDArray comp_buf = compress_buf_[key]; + if (compress_ != "none") { + long int original_size = (long int)(*(recv_blob.dptr()+2)); + dshape = TShape{original_size}; + if (comp_buf.is_none()) { + comp_buf = NDArray(dshape, Context()); + } + } + if (stored.is_none()) { // initialization stored = NDArray(dshape, Context()); - CopyFromTo(recved, &stored, 0); + if (compress_ == "none") { + CopyFromTo(recved, &stored, 0); + } else { + Dequantize(recved, &stored, compress_, 0); + } server->Response(req_meta); stored.WaitToRead(); } else if (sync_mode_) { @@ -393,9 +412,19 @@ class KVStoreDistServer { merged.array = NDArray(dshape, Context()); } if (merged.request.size() == 0) { - CopyFromTo(recved, &merged.array, 0); + if (compress_ == "none") { + CopyFromTo(recved, &merged.array, 0); + } else { + Dequantize(recved, &comp_buf, compress_, 0); + CopyFromTo(comp_buf, &merged.array, 0); + } } else { - merged.array += recved; + if (compress_ == "none") { + merged.array += recved; + } else { + Dequantize(recved, &comp_buf, compress_, 0); + merged.array += comp_buf; + } } merged.request.push_back(req_meta); ApplyUpdates(key, &merged, &stored, server); @@ -435,12 +464,16 @@ class KVStoreDistServer { std::unordered_map store_; std::unordered_map merge_buf_; + std::unordered_map compress_buf_; Executor exec_; ps::KVServer* ps_server_; // whether to LOG verbose information bool log_verbose_; + + // set to use 
gradient compression + std::string compress_; }; } // namespace kvstore From 112b683ac76ffe0335ab8745d1023f2ac77f3cac Mon Sep 17 00:00:00 2001 From: Chao Ma Date: Fri, 15 Sep 2017 13:34:20 -0700 Subject: [PATCH 038/237] Update ndarray_function.cu --- src/ndarray/ndarray_function.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index dde1eb02f2c0..8e6ed47b1a3d 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -211,6 +211,7 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { mxnet::op::Quantize2BitImpl(s,inputs); +} } // namespace ndarray } // namespace mxnet From fe10b7a0546f085932830dd2bd7127be99b28e0c Mon Sep 17 00:00:00 2001 From: Chao Ma Date: Fri, 15 Sep 2017 14:39:20 -0700 Subject: [PATCH 039/237] Update kvstore_dist.h --- src/kvstore/kvstore_dist.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 422cbbfdd013..e0ecb96b367b 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -352,12 +352,7 @@ class KVStoreDist : public KVStoreLocal { auto push_to_servers = [this, key, send_buf, merged](RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys - size_t size = 0; - if (compress_ == "none") { - size = send_buf.shape().Size(); - } else { - size = merged.shape().Size(); - } + size_t size = send_buf.shape().Size(); PSKV& pskv = EncodeKey(key, size); From cea9199350e726e0fd3137b4af276bff7179278c Mon Sep 17 00:00:00 2001 From: Chao Ma Date: Fri, 15 Sep 2017 15:08:02 -0700 Subject: [PATCH 040/237] Update kvstore_dist.h --- src/kvstore/kvstore_dist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index e0ecb96b367b..f5bf9c7c1bae 100644 --- a/src/kvstore/kvstore_dist.h 
+++ b/src/kvstore/kvstore_dist.h @@ -350,7 +350,7 @@ class KVStoreDist : public KVStoreLocal { // push to servers if (storage_type == kDefaultStorage) { auto push_to_servers = - [this, key, send_buf, merged](RunContext rctx, Engine::CallbackOnComplete cb) { + [this, key, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys size_t size = send_buf.shape().Size(); PSKV& pskv = EncodeKey(key, size); From 0ad7acc4a4f79f088fbab10445aa092eb3dfa016 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Fri, 15 Sep 2017 22:40:04 +0000 Subject: [PATCH 041/237] update --- src/kvstore/kvstore_dist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index f5bf9c7c1bae..37704d9e39f9 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -496,7 +496,7 @@ class KVStoreDist : public KVStoreLocal { mu_.unlock(); if (!pskv.keys.empty()) { - CHECK_EQ(static_cast(pskv.size), size) << "The value size cannot be changed"; + //CHECK_EQ(static_cast(pskv.size), size) << "The value size cannot be changed"; } else { auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); int num_servers = krs.size(); From e44f8fb0b2859b5c68c10faf0caf94c2c6a382a9 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Fri, 15 Sep 2017 23:16:21 +0000 Subject: [PATCH 042/237] update --- example/image-classification/common/fit.py | 2 +- example/image-classification/hosts | 5 +++++ src/kvstore/kvstore_dist.h | 17 ++++++++++++----- 3 files changed, 18 insertions(+), 6 deletions(-) create mode 100644 example/image-classification/hosts diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index aeead0f82a3b..53ff11b25435 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -114,7 +114,7 @@ def fit(args, network, data_loader, **kwargs): """ # kvstore kv = mx.kvstore.create(args.kv_store) - + kv.set_compress('2bit', 0.1, 
-0.1) # logging head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) diff --git a/example/image-classification/hosts b/example/image-classification/hosts new file mode 100644 index 000000000000..ea9f2cc6c033 --- /dev/null +++ b/example/image-classification/hosts @@ -0,0 +1,5 @@ +172.31.17.116 +172.31.34.134 +172.31.69.158 +172.31.67.2 + diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 37704d9e39f9..6aaf77d54866 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -187,7 +187,7 @@ class KVStoreDist : public KVStoreLocal { RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys size_t size = recv_buf.shape().Size(); - PSKV& pskv = EncodeKey(key, size); + PSKV& pskv = EncodeKey(key, size, false); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(recv_buf.data()); #endif @@ -353,8 +353,7 @@ class KVStoreDist : public KVStoreLocal { [this, key, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys size_t size = send_buf.shape().Size(); - PSKV& pskv = EncodeKey(key, size); - + PSKV& pskv = EncodeKey(key, size, true); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(send_buf.data()); @@ -480,6 +479,8 @@ class KVStoreDist : public KVStoreLocal { /** * \brief cache all key partitions */ + std::unordered_map push_ps_kv_; + std::unordered_map pull_ps_kv_; std::unordered_map ps_kv_; /** @@ -490,13 +491,19 @@ class KVStoreDist : public KVStoreLocal { /** * \brief convert to keys in ps */ - inline PSKV& EncodeKey(int key, size_t size) { + inline PSKV& EncodeKey(int key, size_t size, bool is_push) { mu_.lock(); PSKV& pskv = ps_kv_[key]; + if (is_push) { + pskv = push_ps_kv_[key]; + } else { + pskv = pull_ps_kv_[key]; + } mu_.unlock(); if (!pskv.keys.empty()) { - //CHECK_EQ(static_cast(pskv.size), size) << "The value size cannot be changed"; + // For compress, we cannt check here + CHECK_EQ(static_cast(pskv.size), size) << 
"The value size cannot be changed"; } else { auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); int num_servers = krs.size(); From 09ceb54b41a785e08a304e265065770c15dc03ba Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Mon, 18 Sep 2017 17:17:49 +0000 Subject: [PATCH 043/237] update --- src/kvstore/kvstore_dist.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 6aaf77d54866..0d3fa2626b65 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -174,7 +174,7 @@ class KVStoreDist : public KVStoreLocal { int key = uniq_keys[i]; // use the same array for merging to guarantee that pull always happens // after the previous push on this key - auto& recv_buf = comm_buf_[key]; + auto& recv_buf = recv_comm_buf_[key]; const auto storage_type = grouped_vals[i][0]->storage_type(); CHECK_EQ(storage_type, kDefaultStorage) << "Expected stype of value to be kDefaultStorage"; @@ -270,7 +270,7 @@ class KVStoreDist : public KVStoreLocal { const auto& vals = grouped_vals[i]; NDArray merged = do_merge ? 
comm_->Reduce(key, vals, priority) : vals[0]; - auto& send_buf = comm_buf_[key]; + auto& send_buf = send_comm_buf_[key]; auto& small_buf = comm_small_buf_[key]; auto& res_buf = residual_[key]; @@ -608,6 +608,9 @@ class KVStoreDist : public KVStoreLocal { */ size_t bigarray_bound_; /// \brief send & recver buffer + std::unordered_map send_comm_buf_; + std::unordered_map recv_comm_buf_; + std::unordered_map comm_buf_; /// \brief small buffer for quantize From 09971bf4a44daf5b0541427588109472f4844b92 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Mon, 18 Sep 2017 21:48:23 +0000 Subject: [PATCH 044/237] fix bug --- src/kvstore/kvstore_dist.h | 119 +++++++++++++++---------------------- 1 file changed, 49 insertions(+), 70 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 0d3fa2626b65..95add6d9db81 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -174,7 +174,7 @@ class KVStoreDist : public KVStoreLocal { int key = uniq_keys[i]; // use the same array for merging to guarantee that pull always happens // after the previous push on this key - auto& recv_buf = recv_comm_buf_[key]; + auto& recv_buf = comm_buf_[key]; const auto storage_type = grouped_vals[i][0]->storage_type(); CHECK_EQ(storage_type, kDefaultStorage) << "Expected stype of value to be kDefaultStorage"; @@ -270,7 +270,23 @@ class KVStoreDist : public KVStoreLocal { const auto& vals = grouped_vals[i]; NDArray merged = do_merge ? 
comm_->Reduce(key, vals, priority) : vals[0]; - auto& send_buf = send_comm_buf_[key]; + auto& send_buf = comm_buf_[key]; + const auto storage_type = merged.storage_type(); + if (merged.ctx().dev_mask() == cpu::kDevMask) { + // make sure the previous push/pull is completed + send_buf.WaitToWrite(); + send_buf = merged; // avoid memory copy + } else { + if (send_buf.is_none()) { + if (storage_type == kDefaultStorage) { + send_buf = NDArray(merged.shape(), pinned_ctx_, true, merged.dtype()); + } else { + send_buf = NDArray(storage_type, merged.shape(), pinned_ctx_, true, merged.dtype()); + } + } + CopyFromTo(merged, &send_buf); + } + auto& small_buf = comm_small_buf_[key]; auto& res_buf = residual_[key]; @@ -282,83 +298,53 @@ class KVStoreDist : public KVStoreLocal { merged.shape().Size() / bits + 4; // small buffer for quantize small_buf = NDArray(TShape{small_size}, - merged.ctx(), - false, - merged.dtype()); + send_buf.ctx(), false, send_buf.dtype()); // residual buffer for quantize - res_buf = NDArray(merged.shape(), - merged.ctx(), - false, - merged.dtype()); - } - // Init positive and negative threshold - if (pos_thre_.is_none() && compress_ != "none") { - // positive threshold - pos_thre_ = NDArray(TShape{1}, merged.ctx(), - false, merged.dtype()); - pos_thre_ = pos_threshold_; - // negative threshold - neg_thre_ = NDArray(TShape{1}, merged.ctx(), - false, merged.dtype()); - neg_thre_ = neg_threshold_; + res_buf = NDArray(merged.shape(), send_buf.ctx(), + false, send_buf.dtype()); + res_buf = 0; + if (pos_thre_.is_none()) { + // positive threshold + pos_thre_ = NDArray(TShape{1}, send_buf.ctx(), + false, send_buf.dtype()); + pos_thre_ = pos_threshold_; + // negative threshold + neg_thre_ = NDArray(TShape{1}, send_buf.ctx(), + false, send_buf.dtype()); + neg_thre_ = neg_threshold_; + } } // Compress if (compress_ == "2bit") { - Quantize(merged, &small_buf, &res_buf, + Quantize(send_buf, &small_buf, &res_buf, pos_thre_, neg_thre_, compress_, priority); } - const 
auto storage_type = merged.storage_type(); - if (merged.ctx().dev_mask() == cpu::kDevMask) { - // make sure the previous push/pull is completed - send_buf.WaitToWrite(); - if (compress_ == "none") { - send_buf = merged; // avoid memory copy - } else { - send_buf = small_buf; // avoid memory copy - } - } else { - if (send_buf.is_none()) { - if (storage_type == kDefaultStorage) { - if (compress_ == "none") { - send_buf = NDArray(merged.shape(), - pinned_ctx_, true, merged.dtype()); - } else { - send_buf = NDArray(small_buf.shape(), - pinned_ctx_, true, small_buf.dtype()); - } - } else { - if (compress_ == "none") { - send_buf = NDArray(storage_type, merged.shape(), - pinned_ctx_, true, merged.dtype()); - } else { - send_buf = NDArray(storage_type, small_buf.shape(), - pinned_ctx_, true, small_buf.dtype()); - } - } - } - if (compress_ == "none") { - CopyFromTo(merged, &send_buf); - } else { - CopyFromTo(small_buf, &send_buf); - } - } - // push to servers if (storage_type == kDefaultStorage) { auto push_to_servers = - [this, key, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { + [this, key, send_buf, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys - size_t size = send_buf.shape().Size(); + size_t size = 0; + if (compress_ == "none") { + size = send_buf.shape().Size(); + } else { + size = small_buf.shape().Size(); + } PSKV& pskv = EncodeKey(key, size, true); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(send_buf.data()); #endif - real_t* data = send_buf.data().dptr(); + real_t* data = nullptr; + if (compress_ == "none") { + data = send_buf.data().dptr(); + } else { + data = small_buf.data().dptr(); + } // do push. 
false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( @@ -479,10 +465,9 @@ class KVStoreDist : public KVStoreLocal { /** * \brief cache all key partitions */ + std::unordered_map ps_kv_; std::unordered_map push_ps_kv_; std::unordered_map pull_ps_kv_; - std::unordered_map ps_kv_; - /** * \brief serizelize EncodeRowSparseKey and EncodeKey */ @@ -493,10 +478,8 @@ class KVStoreDist : public KVStoreLocal { */ inline PSKV& EncodeKey(int key, size_t size, bool is_push) { mu_.lock(); - PSKV& pskv = ps_kv_[key]; - if (is_push) { - pskv = push_ps_kv_[key]; - } else { + PSKV& pskv = push_ps_kv_[key]; + if (!is_push) { pskv = pull_ps_kv_[key]; } mu_.unlock(); @@ -607,12 +590,8 @@ class KVStoreDist : public KVStoreLocal { * \brief threshold for partition */ size_t bigarray_bound_; - /// \brief send & recver buffer - std::unordered_map send_comm_buf_; - std::unordered_map recv_comm_buf_; std::unordered_map comm_buf_; - /// \brief small buffer for quantize std::unordered_map comm_small_buf_; /// \brief residual buffer for quantize From 237dc9b7dde79a57df6b71d1f96287545158815b Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Mon, 18 Sep 2017 23:33:54 +0000 Subject: [PATCH 045/237] fix --- src/kvstore/kvstore_dist_server.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index cb766493d61f..58538a58e454 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -33,7 +33,7 @@ #include #include "ps/ps.h" #include "mxnet/kvstore.h" -#include "../operator/tensor/elemwise_binary_op-inl.h" +#include "../operator/tensor/elemwise_binary_op.h" #include "../operator/tensor/init_op.h" #include "../ndarray/ndarray_function.h" @@ -291,14 +291,14 @@ class KVStoreDistServer { // instead of calling BinaryComputeRspRsp directly using namespace mshadow; Engine::Get()->PushSync([recved, merged, out](RunContext ctx) { - 
std::vector inputs, outputs; - inputs.push_back(recved); - inputs.push_back(merged.array); - outputs.push_back(out); - op::ElemwiseBinaryOp::ComputeEx( - {}, {}, inputs, {kWriteTo}, outputs); - }, recved.ctx(), const_vars, {out.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + std::vector inputs, outputs; + inputs.push_back(recved); + inputs.push_back(merged.array); + outputs.push_back(out); + op::ElemwiseBinaryOp::ComputeEx( + {}, {}, inputs, {kWriteTo}, outputs); + }, recved.ctx(), const_vars, {out.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); CopyFromTo(out, &merged.array, 0); } merged.request.push_back(req_meta); @@ -366,6 +366,7 @@ class KVStoreDistServer { void DataHandleDefault(const ps::KVMeta& req_meta, const ps::KVPairs &req_data, ps::KVServer* server) { + std::cout << "hererrrrrrrrrrrrrrrr" << std::endl; CHECK_EQ(req_meta.cmd, kDefaultPushPull); // do some check CHECK_EQ(req_data.keys.size(), (size_t)1); @@ -381,8 +382,10 @@ class KVStoreDistServer { // could be deallocated when this function returns. 
so we need to make sure // the operators with \a NDArray are actually finished if (req_meta.push) { + std::cout << "pushhhhhhhhhhhhh" << std::endl; size_t ds[] = {(size_t)req_data.lens[0]}; TShape dshape(ds, ds + 1); + std::cout << "Recv shape: " << dshape.Size(); TBlob recv_blob((real_t*)req_data.vals.data(), // NOLINT(*) dshape, cpu::kDevMask); NDArray recved = NDArray(recv_blob, 0); @@ -391,6 +394,7 @@ class KVStoreDistServer { if (compress_ != "none") { long int original_size = (long int)(*(recv_blob.dptr()+2)); dshape = TShape{original_size}; + std::cout << "Uncompress shape: " << dshape.Size(); if (comp_buf.is_none()) { comp_buf = NDArray(dshape, Context()); } From 2ffcfebb6ef350326ad0c4de7981bb3cd2ecb4fa Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 19 Sep 2017 11:10:24 -0700 Subject: [PATCH 046/237] add GC test --- tests/nightly/dist_sync_kvstore.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index af1ecfc5036f..4b3248bd3922 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -166,11 +166,22 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): expected[row] = updated_val[row] check_diff_to_scalar(val, expected, rank=my_rank) - check_default_keys(kv, my_rank, nworker) + check_default_keys(kv, my_rank, nworker) check_row_sparse_keys(kv, my_rank, nworker) check_row_sparse_keys_with_zeros(kv, my_rank, nworker) check_big_row_sparse_keys(kv, my_rank, nworker) print('worker ' + str(my_rank) + ' is done') +def test_quantize(): + kv = mx.kv.create('dist_sync') + kv.set_compress('2bit') + kv.init(keys, [mx.nd.ones(big_shape)] * len(keys)) + my_rank = kv.rank + nworker = kv.num_workers + # init updater on servers + kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate)) + kv.push('3',mx.nd.ones(shape)*(my_rank+1)) + if __name__ == "__main__": - test_sync_push_pull() + # test_sync_push_pull() + 
test_quantize() From c91cca3f21384f6ce699f87fd7c0eff483b2b923 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Tue, 19 Sep 2017 19:01:15 +0000 Subject: [PATCH 047/237] fix bug in push --- include/mxnet/c_api.h | 1 + include/mxnet/kvstore.h | 7 ++++--- python/mxnet/gluon/trainer.py | 5 +++-- python/mxnet/kvstore.py | 3 ++- python/mxnet/model.py | 4 ++-- src/c_api/c_api.cc | 3 ++- src/kvstore/kvstore.cc | 7 ++++--- src/kvstore/kvstore_dist.h | 13 +++++++++---- src/kvstore/kvstore_dist_server.h | 13 ++++++------- src/kvstore/kvstore_local.h | 3 ++- tests/nightly/dist_sync_kvstore.py | 23 ++++++++++++++++------- 11 files changed, 51 insertions(+), 31 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 474a6ddfae10..6420359e15c3 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1511,6 +1511,7 @@ MXNET_DLL int MXInitPSEnv(mx_uint num_vars, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXKVStoreCreate(const char *type, + const char *compress, KVStoreHandle *out); /*! diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index b15746881116..f6d4f4589ae9 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -57,7 +57,8 @@ class KVStore { * - 'dist_*' : multi-machines * \return a new created KVStore. 
*/ - static KVStore *Create(const char *type = "local"); + static KVStore *Create(const char *type = "local", + const char *compress = "none"); /** * \brief return the type @@ -406,12 +407,12 @@ class KVStore { /** * \brief positive threshold */ - float pos_threshold_; + float pos_threshold_ = 0.1; /** * \brief negative threshold */ - float neg_threshold_; + float neg_threshold_ = -0.1; /** * \brief whether to do barrier when finalize diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 26a32f463060..31803abb1024 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -49,7 +49,7 @@ class Trainer(object): positive threshold used in 2bit compression. neg_threshold: negative threshold used in 2bit compression. - + Properties ---------- learning_rate: float @@ -119,7 +119,8 @@ def _init_kvstore(self): arg_arrays = {param.name: param.data(self._contexts[0]) for param in self._params} kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts), - arg_arrays) + arg_arrays, + self._compress) if kvstore: kvstore.set_compress(self._compress, self._pos_threshold, diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 30c4e5f45ba4..cda8fe86135e 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -541,7 +541,7 @@ def _send_command_to_servers(self, head, body): check_call(_LIB.MXKVStoreSendCommmandToServers( self.handle, mx_uint(head), c_str(body))) -def create(name='local'): +def create(name='local', compress='none'): """Creates a new KVStore. 
For single machine training, there are two commonly used types: @@ -580,5 +580,6 @@ def create(name='local'): raise TypeError('name must be a string') handle = KVStoreHandle() check_call(_LIB.MXKVStoreCreate(c_str(name), + c_str(compress), ctypes.byref(handle))) return KVStore(handle) diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 2444ca0dc59e..e52f7aa8b23c 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -54,7 +54,7 @@ 'eval_metric', 'locals']) -def _create_kvstore(kvstore, num_device, arg_params): +def _create_kvstore(kvstore, num_device, arg_params, compress): """Create kvstore This function select and create a proper kvstore if given the kvstore type. @@ -78,7 +78,7 @@ def _create_kvstore(kvstore, num_device, arg_params): # no need to use kv for single device and single machine kv = None else: - kv = kvs.create(kvstore) + kv = kvs.create(kvstore, compress) if kvstore == 'local': # automatically select a proper local max_size = max(np.prod(param.shape) for param in diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 927d9d6d9637..2d162a39d2cb 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -721,9 +721,10 @@ int MXDataIterGetPadNum(DataIterHandle handle, int *pad) { } int MXKVStoreCreate(const char *type, + const char *compress, KVStoreHandle *out) { API_BEGIN(); - *out = KVStore::Create(type); + *out = KVStore::Create(type, compress); API_END(); } diff --git a/src/kvstore/kvstore.cc b/src/kvstore/kvstore.cc index a288676102cb..f6e8745a56ce 100644 --- a/src/kvstore/kvstore.cc +++ b/src/kvstore/kvstore.cc @@ -31,7 +31,7 @@ namespace mxnet { -KVStore* KVStore::Create(const char *type_name) { +KVStore* KVStore::Create(const char *type_name, const char *comp) { std::string tname = type_name; std::transform(tname.begin(), tname.end(), tname.begin(), ::tolower); KVStore* kv = nullptr; @@ -42,10 +42,11 @@ KVStore* KVStore::Create(const char *type_name) { if (has("device")) { use_device_comm = true; } + std::string 
compress(comp); if (has("dist")) { #if MXNET_USE_DIST_KVSTORE - kv = new kvstore::KVStoreDist(use_device_comm); + kv = new kvstore::KVStoreDist(use_device_comm, compress); if (!has("_async") && kv->IsWorkerNode() && kv->get_rank() == 0) { // configure the server to be the sync mode kv->SendCommandToServers(kvstore::kSyncMode, ""); @@ -55,7 +56,7 @@ KVStore* KVStore::Create(const char *type_name) { return nullptr; #endif // MXNET_USE_DIST_KVSTORE } else { - kv = new kvstore::KVStoreLocal(use_device_comm); + kv = new kvstore::KVStoreLocal(use_device_comm, compress); } kv->type_ = tname; return kv; diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 95add6d9db81..3ad166a25416 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -52,8 +52,8 @@ namespace kvstore { */ class KVStoreDist : public KVStoreLocal { public: - explicit KVStoreDist(bool use_device_comm) - : KVStoreLocal(use_device_comm), ps_worker_(nullptr), server_(nullptr) { + explicit KVStoreDist(bool use_device_comm, std::string& comp) + : KVStoreLocal(use_device_comm, comp), ps_worker_(nullptr), server_(nullptr) { if (IsWorkerNode()) { ps_worker_ = new ps::KVWorker(0); ps::StartAsync("mxnet\0"); @@ -61,6 +61,7 @@ class KVStoreDist : public KVStoreLocal { ps::Postoffice::Get()->Barrier( ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); } + compress_ = comp; } bigarray_bound_ = dmlc::GetEnv("MXNET_KVSTORE_BIGARRAY_BOUND", 1000 * 1000); log_verbose_ = dmlc::GetEnv("MXNET_KVSTORE_DIST_ROW_SPARSE_VERBOSE", false); @@ -118,9 +119,8 @@ class KVStoreDist : public KVStoreLocal { void RunServer(const Controller& controller) override { CHECK(!IsWorkerNode()); if (IsServerNode()) { - server_ = new KVStoreDistServer(); + server_ = new KVStoreDistServer(compress_); server_->set_controller(controller); - server_->set_compress(compress_); } ps::StartAsync("mxnet_server\0"); @@ -140,6 +140,9 @@ class KVStoreDist : public KVStoreLocal { void InitImpl(const std::vector& keys, 
const std::vector& values) override { CheckUnique(keys); + if (IsServerNode()) { + server_->set_compress(compress_); + } for (size_t i = 0; i < keys.size(); ++i) { comm_->Init(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } @@ -259,6 +262,7 @@ class KVStoreDist : public KVStoreLocal { const std::vector& values, int priority, bool do_merge) { + std::cout << "worker: " << compress_ << std::endl; // first aggregate the values over keys std::vector uniq_keys; std::vector > grouped_vals; @@ -317,6 +321,7 @@ class KVStoreDist : public KVStoreLocal { // Compress if (compress_ == "2bit") { + std::cout << "worker: compress !!!" << std::endl; Quantize(send_buf, &small_buf, &res_buf, pos_thre_, neg_thre_, compress_, diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 58538a58e454..d3fdc506af75 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -109,7 +109,7 @@ class Executor { class KVStoreDistServer { public: - KVStoreDistServer() { + KVStoreDistServer(std::string comp) { using namespace std::placeholders; ps_server_ = new ps::KVServer(0); static_cast(ps_server_)->set_request_handle( @@ -118,6 +118,7 @@ class KVStoreDistServer { std::bind(&KVStoreDistServer::DataHandleEx, this, _1, _2, _3)); sync_mode_ = false; log_verbose_ = dmlc::GetEnv("MXNET_KVSTORE_DIST_ROW_SPARSE_VERBOSE", false); + compress_ = comp; } ~KVStoreDistServer() { @@ -366,7 +367,8 @@ class KVStoreDistServer { void DataHandleDefault(const ps::KVMeta& req_meta, const ps::KVPairs &req_data, ps::KVServer* server) { - std::cout << "hererrrrrrrrrrrrrrrr" << std::endl; + + std::cout << "server: " << compress_ << std::endl; CHECK_EQ(req_meta.cmd, kDefaultPushPull); // do some check CHECK_EQ(req_data.keys.size(), (size_t)1); @@ -382,24 +384,21 @@ class KVStoreDistServer { // could be deallocated when this function returns. 
so we need to make sure // the operators with \a NDArray are actually finished if (req_meta.push) { - std::cout << "pushhhhhhhhhhhhh" << std::endl; size_t ds[] = {(size_t)req_data.lens[0]}; TShape dshape(ds, ds + 1); - std::cout << "Recv shape: " << dshape.Size(); TBlob recv_blob((real_t*)req_data.vals.data(), // NOLINT(*) dshape, cpu::kDevMask); NDArray recved = NDArray(recv_blob, 0); - + std::cout << "server: compress shape: " << dshape.Size() << std::endl; NDArray comp_buf = compress_buf_[key]; if (compress_ != "none") { long int original_size = (long int)(*(recv_blob.dptr()+2)); dshape = TShape{original_size}; - std::cout << "Uncompress shape: " << dshape.Size(); + std::cout << "server: Uncompress shape: " << dshape.Size() << std::endl; if (comp_buf.is_none()) { comp_buf = NDArray(dshape, Context()); } } - if (stored.is_none()) { // initialization stored = NDArray(dshape, Context()); diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index f653339ffd7f..05eb441d295b 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -51,13 +51,14 @@ class KVStoreLocal : public KVStore { /* * \param use_device_comm */ - explicit KVStoreLocal(bool use_device_comm) : KVStore() { + explicit KVStoreLocal(bool use_device_comm, std::string& comp) : KVStore() { if (use_device_comm) { comm_ = new CommDevice(); } else { comm_ = new CommCPU(); } pinned_ctx_ = comm_->pinned_ctx(); + compress_ = comp; } virtual ~KVStoreLocal() { diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index af1ecfc5036f..b69b139475b2 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -39,18 +39,19 @@ def check_diff_to_scalar(A, x, rank=None): def init_kv(): - kv = mx.kv.create('dist_sync') + kv = mx.kv.create('dist_sync', '2bit') # init kv dns keys - kv.init(keys, [mx.nd.ones(shape)] * len(keys)) - kv.init('99', mx.nd.ones(big_shape)) + #kv.init(keys, [mx.nd.ones(shape)] * len(keys)) + 
#kv.init('99', mx.nd.ones(big_shape)) + kv.init('99', mx.nd.ones(shape)) # init kv row_sparse keys - kv.init(rsp_keys, [mx.nd.ones(shape).tostype('row_sparse')] * len(rsp_keys)) - kv.init('100', mx.nd.ones(big_shape).tostype('row_sparse')) + #kv.init(rsp_keys, [mx.nd.ones(shape).tostype('row_sparse')] * len(rsp_keys)) + #kv.init('100', mx.nd.ones(big_shape).tostype('row_sparse')) # worker info my_rank = kv.rank nworker = kv.num_workers # init updater on servers - kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate)) + #kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate)) return kv, my_rank, nworker def test_sync_push_pull(): @@ -172,5 +173,13 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): check_big_row_sparse_keys(kv, my_rank, nworker) print('worker ' + str(my_rank) + ' is done') +def test_sync_push(): + kv, my_rank, nworker = init_kv() + kv.push('99', mx.nd.ones(shape)*(my_rank+1)) + val = mx.nd.zeros(shape) + kv.pull('99', out=val) + print val + if __name__ == "__main__": - test_sync_push_pull() + #test_sync_push_pull() + test_sync_push() From 54bb44b5f3019682dd9a497411a6cdc1064a7f2f Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Tue, 19 Sep 2017 21:15:28 +0000 Subject: [PATCH 048/237] fix push and pull --- python/mxnet/kvstore_server.py | 4 ++-- src/kvstore/kvstore_dist.h | 3 +-- src/kvstore/kvstore_dist_server.h | 6 ++++-- tests/nightly/dist_sync_kvstore.py | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/python/mxnet/kvstore_server.py b/python/mxnet/kvstore_server.py index 2504b4674a83..423cce83ac02 100644 --- a/python/mxnet/kvstore_server.py +++ b/python/mxnet/kvstore_server.py @@ -57,7 +57,7 @@ def server_controller(cmd_id, cmd_body, _): raise self.kvstore.set_optimizer(optimizer) else: - print("server %d, unknown command (%d, %s)" % ( + print ("server %d, unknown command (%d, %s)" % ( self.kvstore.rank, cmd_id, cmd_body)) return server_controller @@ -77,7 +77,7 @@ def _init_kvstore_server_module(): 
is_worker = ctypes.c_int() check_call(_LIB.MXKVStoreIsWorkerNode(ctypes.byref(is_worker))) if is_worker.value == 0: - kvstore = create('dist') + kvstore = create('dist', '2bit') server = KVStoreServer(kvstore) server.run() sys.exit() diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 3ad166a25416..85752e022ca7 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -61,8 +61,8 @@ class KVStoreDist : public KVStoreLocal { ps::Postoffice::Get()->Barrier( ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); } - compress_ = comp; } + compress_ = comp; bigarray_bound_ = dmlc::GetEnv("MXNET_KVSTORE_BIGARRAY_BOUND", 1000 * 1000); log_verbose_ = dmlc::GetEnv("MXNET_KVSTORE_DIST_ROW_SPARSE_VERBOSE", false); } @@ -321,7 +321,6 @@ class KVStoreDist : public KVStoreLocal { // Compress if (compress_ == "2bit") { - std::cout << "worker: compress !!!" << std::endl; Quantize(send_buf, &small_buf, &res_buf, pos_thre_, neg_thre_, compress_, diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index d3fdc506af75..3d80b2fa33e0 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -367,8 +367,10 @@ class KVStoreDistServer { void DataHandleDefault(const ps::KVMeta& req_meta, const ps::KVPairs &req_data, ps::KVServer* server) { - - std::cout << "server: " << compress_ << std::endl; + //if (compress_ != "none") { + std::cout << "server: " << compress_ << std::endl; + //} + //std::cout << "server: " << compress_ << std::endl; CHECK_EQ(req_meta.cmd, kDefaultPushPull); // do some check CHECK_EQ(req_data.keys.size(), (size_t)1); diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index b69b139475b2..3be4b3d98872 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -43,7 +43,7 @@ def init_kv(): # init kv dns keys #kv.init(keys, [mx.nd.ones(shape)] * len(keys)) #kv.init('99', mx.nd.ones(big_shape)) - kv.init('99', 
mx.nd.ones(shape)) + kv.init('99', mx.nd.zeros(shape)) # init kv row_sparse keys #kv.init(rsp_keys, [mx.nd.ones(shape).tostype('row_sparse')] * len(rsp_keys)) #kv.init('100', mx.nd.ones(big_shape).tostype('row_sparse')) @@ -175,7 +175,7 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): def test_sync_push(): kv, my_rank, nworker = init_kv() - kv.push('99', mx.nd.ones(shape)*(my_rank+1)) + kv.push('99', mx.nd.ones(shape)) val = mx.nd.zeros(shape) kv.pull('99', out=val) print val From 91df1b3008750713c8d56ba2986405085eeef07b Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Tue, 19 Sep 2017 22:38:33 +0000 Subject: [PATCH 049/237] fix --- python/mxnet/module/module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index d55b2117ebd3..22d6ab4bc6e6 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -484,7 +484,7 @@ def init_optimizer(self, kvstore='local', optimizer='sgd', self._sync_params_from_devices() (kvstore, update_on_kvstore) = \ - _create_kvstore(kvstore, len(self._context), self._arg_params) + _create_kvstore(kvstore, len(self._context), self._arg_params, '2bit') batch_size = self._exec_group.batch_size if kvstore and 'dist' in kvstore.type and '_sync' in kvstore.type: From ec8bbc75ddd2c0fbe8db01ba466a168b62f70a0a Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Tue, 19 Sep 2017 22:46:25 +0000 Subject: [PATCH 050/237] fix --- example/image-classification/common/fit.py | 3 +-- src/kvstore/kvstore_dist.h | 1 - src/kvstore/kvstore_dist_server.h | 4 ---- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 53ff11b25435..965712009303 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -113,8 +113,7 @@ def fit(args, network, data_loader, **kwargs): data_loader : function that returns the train and val 
data iterators """ # kvstore - kv = mx.kvstore.create(args.kv_store) - kv.set_compress('2bit', 0.1, -0.1) + kv = mx.kvstore.create(args.kv_store, '2bit') # logging head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 85752e022ca7..04f57a268071 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -262,7 +262,6 @@ class KVStoreDist : public KVStoreLocal { const std::vector& values, int priority, bool do_merge) { - std::cout << "worker: " << compress_ << std::endl; // first aggregate the values over keys std::vector uniq_keys; std::vector > grouped_vals; diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 3d80b2fa33e0..abf6aefa50ff 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -367,10 +367,6 @@ class KVStoreDistServer { void DataHandleDefault(const ps::KVMeta& req_meta, const ps::KVPairs &req_data, ps::KVServer* server) { - //if (compress_ != "none") { - std::cout << "server: " << compress_ << std::endl; - //} - //std::cout << "server: " << compress_ << std::endl; CHECK_EQ(req_meta.cmd, kDefaultPushPull); // do some check CHECK_EQ(req_data.keys.size(), (size_t)1); From 39f2e4418188f8d4e310b615c966c510b6a788c3 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 19 Sep 2017 16:50:20 -0700 Subject: [PATCH 051/237] uncompiled --- include/mxnet/c_api.h | 1 - include/mxnet/kvstore.h | 15 ++++++++--- python/mxnet/gluon/trainer.py | 25 ++++++++----------- python/mxnet/kvstore.py | 40 ++++++++++++++++++++++-------- python/mxnet/kvstore_server.py | 2 +- python/mxnet/model.py | 4 +-- python/mxnet/module/module.py | 1 + src/c_api/c_api.cc | 3 +-- src/kvstore/kvstore.cc | 7 +++--- src/kvstore/kvstore_dist.h | 14 +++++------ src/kvstore/kvstore_dist_server.h | 37 ++++++++++++++++++++++++--- src/kvstore/kvstore_local.h | 3 +-- 
tests/nightly/dist_sync_kvstore.py | 3 ++- 13 files changed, 100 insertions(+), 55 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 6420359e15c3..474a6ddfae10 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1511,7 +1511,6 @@ MXNET_DLL int MXInitPSEnv(mx_uint num_vars, * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXKVStoreCreate(const char *type, - const char *compress, KVStoreHandle *out); /*! diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index f6d4f4589ae9..3780d6562dee 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -57,8 +57,7 @@ class KVStore { * - 'dist_*' : multi-machines * \return a new created KVStore. */ - static KVStore *Create(const char *type = "local", - const char *compress = "none"); + static KVStore *Create(const char *type = "local"); /** * \brief return the type @@ -76,6 +75,14 @@ class KVStore { neg_threshold_ = neg_threshold; } + std::string& GetCompressParams() { + std::string rval = compress_; + if (compress_ == "2bit") { + rval += "," + std::to_string(pos_threshold_) + "," + std::to_string(neg_threshold_); + } + return rval; + } + /*! * \brief Initialize a list of key-value pair to the store. * @@ -405,12 +412,12 @@ class KVStore { std::string compress_ = "none"; /** - * \brief positive threshold + * \brief positive threshold for 2bit compression */ float pos_threshold_ = 0.1; /** - * \brief negative threshold + * \brief negative threshold for 2bit compression */ float neg_threshold_ = -0.1; diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 31803abb1024..7600a21e493f 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -57,7 +57,7 @@ class Trainer(object): optimizer, its learning rate can be accessed as optimizer.learning_rate. 
""" def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', - compress='none', pos_threshold=0.1, neg_threshold=-0.1): + compress_params=None): if isinstance(params, (dict, ParameterDict)): params = list(params.values()) if not isinstance(params, (list, tuple)): @@ -71,23 +71,21 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', "First argument must be a list or dict of Parameters, " \ "got list of %s."%(type(param))) self._params.append(param) - if (compress != 'none' and - compress != '2bit' and compress != '1bit'): - raise ValueError("The compress argument can only be 'none', " \ + if compress_params and compress_params.compress != 'none': + if (compress_params.compress != '2bit' and compress_params.compress != '1bit'): + raise ValueError("The compress argument can only be 'none', " \ "'2bit', or '1bit'.") - if (compress == '2bit' and - (pos_threshold <= 0 or neg_threshold >= 0)): - raise ValueError("The pos_threshold must be greater than 0, and " \ + if (compress_params.compress == '2bit' + and (compress_params.pos_threshold <= 0 or compress_params.neg_threshold >= 0)): + raise ValueError("The pos_threshold must be greater than 0, and " \ "the neg_threshold must be less than 0.") + self._compress_params = compress_params if compress_params else {'compress':'none'} optimizer_params = optimizer_params if optimizer_params else {} self._scale = optimizer_params.get('rescale_grad', 1.0) self._contexts = self._check_contexts() self._init_optimizer(optimizer, optimizer_params) self._kv_initialized = False self._kvstore = kvstore - self._compress = compress - self._pos_threshold = pos_threshold - self._neg_threshold = neg_threshold def _check_contexts(self): contexts = None @@ -119,12 +117,9 @@ def _init_kvstore(self): arg_arrays = {param.name: param.data(self._contexts[0]) for param in self._params} kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts), - arg_arrays, - self._compress) + 
arg_arrays) if kvstore: - kvstore.set_compress(self._compress, - self._pos_threshold, - self._neg_threshold) + kvstore.set_compress(self._compress_params) if 'dist' in kvstore.type: update_on_kvstore = False for i, param in enumerate(self._params): diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index cda8fe86135e..ee70c2058ea2 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -24,7 +24,7 @@ from .ndarray import NDArray from .ndarray import _ndarray_cls from .base import _LIB -from .base import check_call, c_array, c_str, string_types, mx_uint, mx_float, py_str +from .base import check_call, c_array, c_str, string_types, numeric_types, mx_uint, mx_float, py_str from .base import NDArrayHandle, KVStoreHandle from . import optimizer as opt @@ -346,19 +346,38 @@ def row_sparse_pull(self, key, out=None, priority=0, row_ids=None): self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority))) - def set_compress(self, compress='none', - pos_threshold=0.1, - neg_threshold=-0.1): + def set_compress(self, compress_params={'compress':'none'}): """ Set to use low-bit compression compress can be 'none', '2bit', or '1bit'. 
""" - if not isinstance(compress, string_types): + if not isinstance(compress_params.compress, string_types): raise TypeError('compress must be a string') - check_call(_LIB.MXKVStoreSetCompress(self.handle, - c_str(compress), - mx_float(pos_threshold), - mx_float(neg_threshold))) + if compress_params.compress == '2bit': + if 'pos_threshold' in compress_params: + if not isinstance(compress_params.pos_threshold, numeric_types): + raise TypeError('pos_threshold must be a numeric type') + else: + compress_params.pos_threshold = 0.1 + + if 'neg_threshold' in compress_params: + if not isinstance(compress_params.pos_threshold, numeric_types)): + raise TypeError('pos_threshold must be a numeric type') + else: + compress_params.neg_threshold = -0.1 + + if (compress_params.pos_threshold <= 0 or compress_params.neg_threshold >= 0): + raise ValueError('pos_threshold needs to be greater than 0, \ + and neg_threshold needs to be less than 0') + + if compress_params.compress not in ['none','2bit']: + raise ValueError('Unsupported type of compression') + + if compress_params.compress == '2bit': + check_call(_LIB.MXKVStoreSetCompress(self.handle, + c_str(compress_params.compress), + mx_float(compress_params.pos_threshold), + mx_float(compress_params.neg_threshold))) def set_optimizer(self, optimizer): """ Registers an optimizer with the kvstore. @@ -541,7 +560,7 @@ def _send_command_to_servers(self, head, body): check_call(_LIB.MXKVStoreSendCommmandToServers( self.handle, mx_uint(head), c_str(body))) -def create(name='local', compress='none'): +def create(name='local'): """Creates a new KVStore. 
For single machine training, there are two commonly used types: @@ -580,6 +599,5 @@ def create(name='local', compress='none'): raise TypeError('name must be a string') handle = KVStoreHandle() check_call(_LIB.MXKVStoreCreate(c_str(name), - c_str(compress), ctypes.byref(handle))) return KVStore(handle) diff --git a/python/mxnet/kvstore_server.py b/python/mxnet/kvstore_server.py index 423cce83ac02..1bb995a45ca8 100644 --- a/python/mxnet/kvstore_server.py +++ b/python/mxnet/kvstore_server.py @@ -77,7 +77,7 @@ def _init_kvstore_server_module(): is_worker = ctypes.c_int() check_call(_LIB.MXKVStoreIsWorkerNode(ctypes.byref(is_worker))) if is_worker.value == 0: - kvstore = create('dist', '2bit') + kvstore = create('dist') server = KVStoreServer(kvstore) server.run() sys.exit() diff --git a/python/mxnet/model.py b/python/mxnet/model.py index e52f7aa8b23c..2444ca0dc59e 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -54,7 +54,7 @@ 'eval_metric', 'locals']) -def _create_kvstore(kvstore, num_device, arg_params, compress): +def _create_kvstore(kvstore, num_device, arg_params): """Create kvstore This function select and create a proper kvstore if given the kvstore type. 
@@ -78,7 +78,7 @@ def _create_kvstore(kvstore, num_device, arg_params, compress): # no need to use kv for single device and single machine kv = None else: - kv = kvs.create(kvstore, compress) + kv = kvs.create(kvstore) if kvstore == 'local': # automatically select a proper local max_size = max(np.prod(param.shape) for param in diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index d55b2117ebd3..534e6a38aa09 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -521,6 +521,7 @@ def init_optimizer(self, kvstore='local', optimizer='sgd', self._updater = None if kvstore: + kvstore.set_compress(self._compress_params) # copy initialized local parameters to kvstore _initialize_kvstore(kvstore=kvstore, param_arrays=self._exec_group.param_arrays, diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 2d162a39d2cb..927d9d6d9637 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -721,10 +721,9 @@ int MXDataIterGetPadNum(DataIterHandle handle, int *pad) { } int MXKVStoreCreate(const char *type, - const char *compress, KVStoreHandle *out) { API_BEGIN(); - *out = KVStore::Create(type, compress); + *out = KVStore::Create(type); API_END(); } diff --git a/src/kvstore/kvstore.cc b/src/kvstore/kvstore.cc index f6e8745a56ce..a288676102cb 100644 --- a/src/kvstore/kvstore.cc +++ b/src/kvstore/kvstore.cc @@ -31,7 +31,7 @@ namespace mxnet { -KVStore* KVStore::Create(const char *type_name, const char *comp) { +KVStore* KVStore::Create(const char *type_name) { std::string tname = type_name; std::transform(tname.begin(), tname.end(), tname.begin(), ::tolower); KVStore* kv = nullptr; @@ -42,11 +42,10 @@ KVStore* KVStore::Create(const char *type_name, const char *comp) { if (has("device")) { use_device_comm = true; } - std::string compress(comp); if (has("dist")) { #if MXNET_USE_DIST_KVSTORE - kv = new kvstore::KVStoreDist(use_device_comm, compress); + kv = new kvstore::KVStoreDist(use_device_comm); if (!has("_async") && 
kv->IsWorkerNode() && kv->get_rank() == 0) { // configure the server to be the sync mode kv->SendCommandToServers(kvstore::kSyncMode, ""); @@ -56,7 +55,7 @@ KVStore* KVStore::Create(const char *type_name, const char *comp) { return nullptr; #endif // MXNET_USE_DIST_KVSTORE } else { - kv = new kvstore::KVStoreLocal(use_device_comm, compress); + kv = new kvstore::KVStoreLocal(use_device_comm); } kv->type_ = tname; return kv; diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 85752e022ca7..b009f381c3b5 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -52,17 +52,19 @@ namespace kvstore { */ class KVStoreDist : public KVStoreLocal { public: - explicit KVStoreDist(bool use_device_comm, std::string& comp) - : KVStoreLocal(use_device_comm, comp), ps_worker_(nullptr), server_(nullptr) { + explicit KVStoreDist(bool use_device_comm) + : KVStoreLocal(use_device_comm), ps_worker_(nullptr), server_(nullptr) { if (IsWorkerNode()) { ps_worker_ = new ps::KVWorker(0); ps::StartAsync("mxnet\0"); if (!ps::Postoffice::Get()->is_recovery()) { + if (get_rank() == 0) { + SendCommandToServers(kSetCompress, GetCompressParams()); + } ps::Postoffice::Get()->Barrier( ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); } } - compress_ = comp; bigarray_bound_ = dmlc::GetEnv("MXNET_KVSTORE_BIGARRAY_BOUND", 1000 * 1000); log_verbose_ = dmlc::GetEnv("MXNET_KVSTORE_DIST_ROW_SPARSE_VERBOSE", false); } @@ -119,7 +121,7 @@ class KVStoreDist : public KVStoreLocal { void RunServer(const Controller& controller) override { CHECK(!IsWorkerNode()); if (IsServerNode()) { - server_ = new KVStoreDistServer(compress_); + server_ = new KVStoreDistServer(); server_->set_controller(controller); } @@ -140,9 +142,6 @@ class KVStoreDist : public KVStoreLocal { void InitImpl(const std::vector& keys, const std::vector& values) override { CheckUnique(keys); - if (IsServerNode()) { - server_->set_compress(compress_); - } for (size_t i = 0; i < keys.size(); ++i) { 
comm_->Init(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } @@ -262,7 +261,6 @@ class KVStoreDist : public KVStoreLocal { const std::vector& values, int priority, bool do_merge) { - std::cout << "worker: " << compress_ << std::endl; // first aggregate the values over keys std::vector uniq_keys; std::vector > grouped_vals; diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 3d80b2fa33e0..9a372a40f50c 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -44,6 +44,17 @@ static const int kRowSparsePushPull = 1; static const int kDefaultPushPull = 0; static const int kStopServer = -1; static const int kSyncMode = -2; +static const int kSetCompress = 2; + +template +void split(const std::string &s, char delim, Out result) { + std::stringstream ss; + ss.str(s); + std::string item; + while (std::getline(ss, item, delim)) { + *(result++) = item; + } +} /** * \brief executor runs a function using the thread called \ref Start @@ -109,7 +120,7 @@ class Executor { class KVStoreDistServer { public: - KVStoreDistServer(std::string comp) { + KVStoreDistServer() { using namespace std::placeholders; ps_server_ = new ps::KVServer(0); static_cast(ps_server_)->set_request_handle( @@ -118,7 +129,6 @@ class KVStoreDistServer { std::bind(&KVStoreDistServer::DataHandleEx, this, _1, _2, _3)); sync_mode_ = false; log_verbose_ = dmlc::GetEnv("MXNET_KVSTORE_DIST_ROW_SPARSE_VERBOSE", false); - compress_ = comp; } ~KVStoreDistServer() { @@ -130,8 +140,14 @@ class KVStoreDistServer { controller_ = controller; } - void set_compress(const std::string& compress) { - compress_ = compress; + void load_compress_params(std::string& params) { + std::vector elems; + split(params, ',', std::back_inserter(elems)); + compress_ = elems[0]; + if (elems.size() > 1) { + pos_threshold_ = strtof(elems[1]); + neg_threshold_ = strtof(elems[2]); + } } void set_updater(const KVStore::Updater& updater) { @@ -157,6 
+173,8 @@ class KVStoreDistServer { exec_.Stop(); } else if (recved.head == kSyncMode) { sync_mode_ = true; + } else if (recved.head == kSetCompress) { + load_compress_params(recved.body); } else { // let the main thread to execute ctrl, which is necessary for python exec_.Exec([this, recved]() { @@ -480,6 +498,17 @@ class KVStoreDistServer { // set to use gradient compression std::string compress_; + + /** + * \brief positive threshold for 2bit compression + */ + float pos_threshold_ = 0.1; + + /** + * \brief negative threshold for 2bit compression + */ + float neg_threshold_ = -0.1; + }; } // namespace kvstore diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 05eb441d295b..f653339ffd7f 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -51,14 +51,13 @@ class KVStoreLocal : public KVStore { /* * \param use_device_comm */ - explicit KVStoreLocal(bool use_device_comm, std::string& comp) : KVStore() { + explicit KVStoreLocal(bool use_device_comm) : KVStore() { if (use_device_comm) { comm_ = new CommDevice(); } else { comm_ = new CommCPU(); } pinned_ctx_ = comm_->pinned_ctx(); - compress_ = comp; } virtual ~KVStoreLocal() { diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 7283c135f32f..6f27413df8c7 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -172,7 +172,8 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): print('worker ' + str(my_rank) + ' is done') def test_compressed(): - kv = mx.kv.create('dist_sync','2bit') + kv = mx.kv.create('dist_sync') + kv.set_compress({'compress':'2bit'}) kv.init(keys, [mx.nd.ones(big_shape)] * len(keys)) my_rank = kv.rank nworker = kv.num_workers From fd42f8c4f52b7ef074ffa3acf51c5e858e23032b Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 19 Sep 2017 17:13:26 -0700 Subject: [PATCH 052/237] kvstore dist changes. added cpp_package. 
changed strtof function calls --- cpp-package/include/mxnet-cpp/kvstore.h | 1 + cpp-package/include/mxnet-cpp/kvstore.hpp | 6 ++++++ include/mxnet/kvstore.h | 2 +- src/kvstore/kvstore_dist_server.h | 6 +++--- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/kvstore.h b/cpp-package/include/mxnet-cpp/kvstore.h index 9c3c81f37ff7..3c5e445f30ba 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.h +++ b/cpp-package/include/mxnet-cpp/kvstore.h @@ -37,6 +37,7 @@ class KVStore { public: static void SetType(const std::string& type); static void RunServer(); + static void SetCompress(const std::string& compress, const float pos_threshold, const float neg_threshold); static void Init(int key, const NDArray& val); static void Init(const std::vector& keys, const std::vector& vals); static void Push(int key, const NDArray& val, int priority = 0); diff --git a/cpp-package/include/mxnet-cpp/kvstore.hpp b/cpp-package/include/mxnet-cpp/kvstore.hpp index f2b5e74990ce..b32a6dcbc770 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.hpp +++ b/cpp-package/include/mxnet-cpp/kvstore.hpp @@ -82,6 +82,12 @@ inline void KVStore::RunServer() { CHECK_EQ(MXKVStoreRunServer(get_kvstore()->get_handle(), &Controller, 0), 0); } +inline void KVStore::SetCompress(const std::string& compress, + const float pos_threshold, const float neg_threshold) { + CHECK_EQ(MXKVStoreSetCompress(get_kvstore()->get_handle(), + compress.c_str(), pos_threshold, neg_threshold),0); +} + inline void KVStore::Init(int key, const NDArray& val) { NDArrayHandle val_handle = val.GetHandle(); CHECK_EQ(MXKVStoreInit(get_kvstore()->get_handle(), 1, &key, &val_handle), 0); diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index 3780d6562dee..3d607b8b6821 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -75,7 +75,7 @@ class KVStore { neg_threshold_ = neg_threshold; } - std::string& GetCompressParams() { + std::string GetCompressParams() { std::string rval 
= compress_; if (compress_ == "2bit") { rval += "," + std::to_string(pos_threshold_) + "," + std::to_string(neg_threshold_); diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 9a372a40f50c..a3110021f079 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -140,13 +140,13 @@ class KVStoreDistServer { controller_ = controller; } - void load_compress_params(std::string& params) { + void load_compress_params(const std::string& params) { std::vector elems; split(params, ',', std::back_inserter(elems)); compress_ = elems[0]; if (elems.size() > 1) { - pos_threshold_ = strtof(elems[1]); - neg_threshold_ = strtof(elems[2]); + pos_threshold_ = strtof(elems[1].c_str(), NULL); + neg_threshold_ = strtof(elems[2].c_str(), NULL); } } From f743ab150c5f8ff445c47b113e10fa45b12495d0 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 19 Sep 2017 17:38:58 -0700 Subject: [PATCH 053/237] fix usage of keys in dict --- python/mxnet/kvstore.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index ee70c2058ea2..7c4880600f77 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -346,38 +346,40 @@ def row_sparse_pull(self, key, out=None, priority=0, row_ids=None): self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority))) - def set_compress(self, compress_params={'compress':'none'}): + def set_compress(self, compress_params={}): """ Set to use low-bit compression compress can be 'none', '2bit', or '1bit'. 
""" - if not isinstance(compress_params.compress, string_types): + if 'compress' not in compress_params: + compress_params['compress'] = 'none' + elif not isinstance(compress_params['compress'], string_types): raise TypeError('compress must be a string') - if compress_params.compress == '2bit': + elif compress_params['compress'] not in ['none','2bit']: + raise ValueError('Unsupported type of compression') + + if compress_params['compress'] == '2bit': if 'pos_threshold' in compress_params: - if not isinstance(compress_params.pos_threshold, numeric_types): + if not isinstance(compress_params['pos_threshold'], numeric_types): raise TypeError('pos_threshold must be a numeric type') else: - compress_params.pos_threshold = 0.1 + compress_params['pos_threshold'] = 0.1 if 'neg_threshold' in compress_params: - if not isinstance(compress_params.pos_threshold, numeric_types)): - raise TypeError('pos_threshold must be a numeric type') + if not isinstance(compress_params['neg_threshold'], numeric_types): + raise TypeError('neg_threshold must be a numeric type') else: - compress_params.neg_threshold = -0.1 + compress_params['neg_threshold'] = -0.1 - if (compress_params.pos_threshold <= 0 or compress_params.neg_threshold >= 0): + if (compress_params['pos_threshold'] <= 0 or compress_params['neg_threshold'] >= 0): raise ValueError('pos_threshold needs to be greater than 0, \ and neg_threshold needs to be less than 0') - if compress_params.compress not in ['none','2bit']: - raise ValueError('Unsupported type of compression') - - if compress_params.compress == '2bit': + if compress_params['compress'] == '2bit': check_call(_LIB.MXKVStoreSetCompress(self.handle, - c_str(compress_params.compress), - mx_float(compress_params.pos_threshold), - mx_float(compress_params.neg_threshold))) + c_str(compress_params['compress']), + mx_float(compress_params['pos_threshold']), + mx_float(compress_params['neg_threshold']))) def set_optimizer(self, optimizer): """ Registers an optimizer with the 
kvstore. From e18331fb779c7a903e240e3c3aa8d4fb10a7eaa7 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Wed, 20 Sep 2017 18:18:31 +0000 Subject: [PATCH 054/237] fix push and pull --- src/kvstore/kvstore_dist.h | 48 +++++++++++++++++++++++++++--- src/kvstore/kvstore_dist_server.h | 5 ++-- tests/nightly/dist_sync_kvstore.py | 19 ++++++++---- 3 files changed, 60 insertions(+), 12 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 04f57a268071..9da230fd93ad 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -190,7 +190,7 @@ class KVStoreDist : public KVStoreLocal { RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys size_t size = recv_buf.shape().Size(); - PSKV& pskv = EncodeKey(key, size, false); + PSKV* pskv = new_EncodeKey(key, size); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(recv_buf.data()); #endif @@ -199,7 +199,7 @@ class KVStoreDist : public KVStoreLocal { auto vals = new ps::SArray(data, size, false); // issue pull CHECK_NOTNULL(ps_worker_)->ZPull( - pskv.keys, vals, &pskv.lens, kDefaultPushPull, [vals, cb](){ delete vals; cb(); }); + pskv->keys, vals, &(pskv->lens), kDefaultPushPull, [vals, cb](){ delete vals; cb(); }); }; CHECK_NOTNULL(Engine::Get())->PushAsync( @@ -337,7 +337,7 @@ class KVStoreDist : public KVStoreLocal { } else { size = small_buf.shape().Size(); } - PSKV& pskv = EncodeKey(key, size, true); + PSKV* pskv = new_EncodeKey(key, size); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(send_buf.data()); @@ -351,7 +351,7 @@ class KVStoreDist : public KVStoreLocal { // do push. 
false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); + pskv->keys, vals, pskv->lens, 0, [cb]() { cb(); }); }; Engine::Get()->PushAsync( push_to_servers, @@ -476,10 +476,44 @@ class KVStoreDist : public KVStoreLocal { */ std::mutex mu_; + inline PSKV* new_EncodeKey(int key, size_t size) { + PSKV* pskv = new PSKV; + auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); + int num_servers = krs.size(); + CHECK_GT(num_servers, 0); + + // a simple heuristic for load balance + if (size < bigarray_bound_) { + // send it to a single random picked server + int server = (key * 9973) % num_servers; + ps::Key ps_key = krs[server].begin() + key; + CHECK_LT(ps_key, krs[server].end()); + pskv->keys.push_back(ps_key); + pskv->lens.push_back(size); + pskv->size = size; + } else { + // parition it to all servers + pskv->size = 0; + for (int i = 0; i < num_servers; ++i) { + size_t part_size = + static_cast(round(static_cast(size)/num_servers*(i+1))) - + static_cast(round(static_cast(size)/num_servers*i)); + ps::Key ps_key = krs[i].begin() + key; + CHECK_LT(ps_key, krs[i].end()); + pskv->keys.push_back(ps_key); + pskv->lens.push_back(part_size); + pskv->size += part_size; + } + CHECK_EQ(static_cast(pskv->size), size); + } + return pskv; + } + /** * \brief convert to keys in ps */ inline PSKV& EncodeKey(int key, size_t size, bool is_push) { + mu_.lock(); PSKV& pskv = push_ps_kv_[key]; if (!is_push) { @@ -487,6 +521,11 @@ class KVStoreDist : public KVStoreLocal { } mu_.unlock(); + if (is_push) { + std::cout << "[push] key: " << key << " pskv_size: " << pskv.size << " arg size: " << size << std::endl; + } else { + std::cout << "[pull] key: " << key << " pskv_size: " << pskv.size << " arg size: " << size << std::endl; + } if (!pskv.keys.empty()) { // For compress, we cannt check here CHECK_EQ(static_cast(pskv.size), size) << "The value size cannot be changed"; @@ -520,6 +559,7 @@ class KVStoreDist 
: public KVStoreLocal { CHECK_EQ(static_cast(pskv.size), size); } } + return pskv; } diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index abf6aefa50ff..8fedd7224ace 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -367,6 +367,7 @@ class KVStoreDistServer { void DataHandleDefault(const ps::KVMeta& req_meta, const ps::KVPairs &req_data, ps::KVServer* server) { + CHECK_EQ(req_meta.cmd, kDefaultPushPull); // do some check CHECK_EQ(req_data.keys.size(), (size_t)1); @@ -387,12 +388,12 @@ class KVStoreDistServer { TBlob recv_blob((real_t*)req_data.vals.data(), // NOLINT(*) dshape, cpu::kDevMask); NDArray recved = NDArray(recv_blob, 0); - std::cout << "server: compress shape: " << dshape.Size() << std::endl; + // std::cout << "server: compress shape: " << dshape.Size() << std::endl; NDArray comp_buf = compress_buf_[key]; if (compress_ != "none") { long int original_size = (long int)(*(recv_blob.dptr()+2)); dshape = TShape{original_size}; - std::cout << "server: Uncompress shape: " << dshape.Size() << std::endl; + //std::cout << "server: Uncompress shape: " << dshape.Size() << std::endl; if (comp_buf.is_none()) { comp_buf = NDArray(dshape, Context()); } diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 3be4b3d98872..0cd644f7033e 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -34,7 +34,8 @@ def check_diff_to_scalar(A, x, rank=None): rsp_keys = ['9', '11', '13'] rate = 2 -shape = (2, 3) +shape_1 = (2, 3) +shape_2 = (12, 15) big_shape = (1200, 1200) # bigger than BIGARRAY_BOUND @@ -43,7 +44,8 @@ def init_kv(): # init kv dns keys #kv.init(keys, [mx.nd.ones(shape)] * len(keys)) #kv.init('99', mx.nd.ones(big_shape)) - kv.init('99', mx.nd.zeros(shape)) + kv.init('99', mx.nd.zeros(shape_1)) + kv.init('3', mx.nd.zeros(shape_2)) # init kv row_sparse keys #kv.init(rsp_keys, [mx.nd.ones(shape).tostype('row_sparse')] * 
len(rsp_keys)) #kv.init('100', mx.nd.ones(big_shape).tostype('row_sparse')) @@ -175,10 +177,15 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): def test_sync_push(): kv, my_rank, nworker = init_kv() - kv.push('99', mx.nd.ones(shape)) - val = mx.nd.zeros(shape) - kv.pull('99', out=val) - print val + val_1 = mx.nd.zeros(shape_1) + val_2 = mx.nd.zeros(shape_2) + for i in range(100): + kv.push('99', mx.nd.ones(shape_1)) + kv.pull('99', out=val_1) + kv.push('3', mx.nd.ones(shape_2)) + kv.pull('3', out=val_2) + print val_1 + print val_2 if __name__ == "__main__": #test_sync_push_pull() From 7ec80edb6f37c8509f82c88ce77631f8479d6b32 Mon Sep 17 00:00:00 2001 From: aksnzhy Date: Wed, 20 Sep 2017 19:30:08 +0000 Subject: [PATCH 055/237] fix --- src/kvstore/kvstore_dist.h | 53 ++++---------------------------------- 1 file changed, 5 insertions(+), 48 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 9da230fd93ad..5060565ae975 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -190,7 +190,7 @@ class KVStoreDist : public KVStoreLocal { RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys size_t size = recv_buf.shape().Size(); - PSKV* pskv = new_EncodeKey(key, size); + PSKV& pskv = EncodeKey(key, size, false); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(recv_buf.data()); #endif @@ -199,7 +199,7 @@ class KVStoreDist : public KVStoreLocal { auto vals = new ps::SArray(data, size, false); // issue pull CHECK_NOTNULL(ps_worker_)->ZPull( - pskv->keys, vals, &(pskv->lens), kDefaultPushPull, [vals, cb](){ delete vals; cb(); }); + pskv.keys, vals, &pskv.lens, kDefaultPushPull, [vals, cb](){ delete vals; cb(); }); }; CHECK_NOTNULL(Engine::Get())->PushAsync( @@ -337,7 +337,7 @@ class KVStoreDist : public KVStoreLocal { } else { size = small_buf.shape().Size(); } - PSKV* pskv = new_EncodeKey(key, size); + PSKV& pskv = EncodeKey(key, size, true); #if MKL_EXPERIMENTAL == 1 
mkl_set_tblob_eager_mode(send_buf.data()); @@ -351,7 +351,7 @@ class KVStoreDist : public KVStoreLocal { // do push. false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( - pskv->keys, vals, pskv->lens, 0, [cb]() { cb(); }); + pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); }; Engine::Get()->PushAsync( push_to_servers, @@ -476,56 +476,13 @@ class KVStoreDist : public KVStoreLocal { */ std::mutex mu_; - inline PSKV* new_EncodeKey(int key, size_t size) { - PSKV* pskv = new PSKV; - auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); - int num_servers = krs.size(); - CHECK_GT(num_servers, 0); - - // a simple heuristic for load balance - if (size < bigarray_bound_) { - // send it to a single random picked server - int server = (key * 9973) % num_servers; - ps::Key ps_key = krs[server].begin() + key; - CHECK_LT(ps_key, krs[server].end()); - pskv->keys.push_back(ps_key); - pskv->lens.push_back(size); - pskv->size = size; - } else { - // parition it to all servers - pskv->size = 0; - for (int i = 0; i < num_servers; ++i) { - size_t part_size = - static_cast(round(static_cast(size)/num_servers*(i+1))) - - static_cast(round(static_cast(size)/num_servers*i)); - ps::Key ps_key = krs[i].begin() + key; - CHECK_LT(ps_key, krs[i].end()); - pskv->keys.push_back(ps_key); - pskv->lens.push_back(part_size); - pskv->size += part_size; - } - CHECK_EQ(static_cast(pskv->size), size); - } - return pskv; - } - /** * \brief convert to keys in ps */ inline PSKV& EncodeKey(int key, size_t size, bool is_push) { - mu_.lock(); - PSKV& pskv = push_ps_kv_[key]; - if (!is_push) { - pskv = pull_ps_kv_[key]; - } + PSKV& pskv = (is_push) ? 
push_ps_kv_[key] : pull_ps_kv_[key]; mu_.unlock(); - - if (is_push) { - std::cout << "[push] key: " << key << " pskv_size: " << pskv.size << " arg size: " << size << std::endl; - } else { - std::cout << "[pull] key: " << key << " pskv_size: " << pskv.size << " arg size: " << size << std::endl; - } if (!pskv.keys.empty()) { // For compress, we cannt check here CHECK_EQ(static_cast(pskv.size), size) << "The value size cannot be changed"; From 365786912ab4fe3776d16238416f71628f602edb Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 20 Sep 2017 14:09:31 -0700 Subject: [PATCH 056/237] fix_test --- tests/nightly/dist_sync_kvstore.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 3832ffba9a55..9c7bd2a3fe17 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -174,7 +174,9 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): def test_compressed(): kv = mx.kv.create('dist_sync') - kv.set_compress({'compress':'2bit'}) + pos_threshold = 0.5 + neg_threshold = -0.5 + kv.set_compress({'compress':'2bit', 'pos_threshold': pos_threshold , 'neg_threshold': neg_threshold}) # init kv dns keys kv.init('99', mx.nd.ones(big_shape)) kv.init('3', mx.nd.ones(shape)) @@ -188,14 +190,14 @@ def check_default_keys(kv, my_rank, nworker): num = (nworker + 1) * nworker * rate / 2 * nrepeat + 1 val = mx.nd.zeros(shape) kv.pull('3', out=val) - check_diff_to_scalar(val, num) + check_diff_to_scalar(val, pos_threshold) val2 = mx.nd.zeros(big_shape) kv.pull('99', out=val2) - check_diff_to_scalar(val2, num) + check_diff_to_scalar(val2, pos_threshold) check_default_keys(kv, kv.rank, kv.num_workers) if __name__ == "__main__": test_sync_push_pull() - test_compressed() \ No newline at end of file + test_compressed() From 95e073e9d5fa590a385b065239eae4c256438d87 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 20 Sep 2017 14:09:34 -0700 Subject: 
[PATCH 057/237] fix_test --- tests/nightly/dist_sync_kvstore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 9c7bd2a3fe17..dea47856b691 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -172,6 +172,7 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): check_big_row_sparse_keys(kv, my_rank, nworker) print('worker ' + str(my_rank) + ' is done') + def test_compressed(): kv = mx.kv.create('dist_sync') pos_threshold = 0.5 From d595fa53717c5e80125053ecd1f9f22aa19fdf85 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 20 Sep 2017 14:09:46 -0700 Subject: [PATCH 058/237] fix_test --- tests/nightly/dist_sync_kvstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index dea47856b691..ff885c0832f7 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -177,7 +177,7 @@ def test_compressed(): kv = mx.kv.create('dist_sync') pos_threshold = 0.5 neg_threshold = -0.5 - kv.set_compress({'compress':'2bit', 'pos_threshold': pos_threshold , 'neg_threshold': neg_threshold}) + kv.set_compress({'compress': '2bit', 'pos_threshold': pos_threshold, 'neg_threshold': neg_threshold}) # init kv dns keys kv.init('99', mx.nd.ones(big_shape)) kv.init('3', mx.nd.ones(shape)) From f6e2b92e293f6621c18beed44ab78e648dd362db Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 20 Sep 2017 23:49:08 +0000 Subject: [PATCH 059/237] add print statements --- src/kvstore/kvstore_dist.h | 2 ++ src/kvstore/kvstore_dist_server.h | 1 + 2 files changed, 3 insertions(+) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index c1a070db85d5..ff45747213cd 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -57,8 +57,10 @@ class KVStoreDist : public KVStoreLocal { if (IsWorkerNode()) { ps_worker_ = new ps::KVWorker(0); 
ps::StartAsync("mxnet\0"); + //what happens during recovery? if (!ps::Postoffice::Get()->is_recovery()) { if (get_rank() == 0) { + std::cout<Barrier( diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 3441027b6c5f..95f18c9e9669 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -141,6 +141,7 @@ class KVStoreDistServer { } void load_compress_params(const std::string& params) { + std::cout< elems; split(params, ',', std::back_inserter(elems)); compress_ = elems[0]; From 6cf214e158ec54f080db8a2b81f9c631decd819f Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 20 Sep 2017 17:10:00 -0700 Subject: [PATCH 060/237] more print statements and move send command to server --- include/mxnet/kvstore.h | 1 + python/mxnet/kvstore.py | 1 + src/kvstore/kvstore_dist.h | 14 ++++++++++---- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index 3d607b8b6821..f847ac0f0a60 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -73,6 +73,7 @@ class KVStore { compress_ = compress; pos_threshold_ = pos_threshold; neg_threshold_ = neg_threshold; + std::cout<<"Just set"<is_recovery()) { - if (get_rank() == 0) { - std::cout<Barrier( ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); } @@ -95,6 +91,16 @@ class KVStoreDist : public KVStoreLocal { } } + void SetCompress(const std::string& compress, + const float pos_threshold, + const float neg_threshold) { + KVStore::SetCompress(compress, pos_threshold, neg_threshold); + if (get_rank() == 0) { + std::cout << GetCompressParams() << std::endl; + SendCommandToServers(kSetCompress, GetCompressParams()); + } + } + void Barrier() override { ps::Postoffice::Get()->Barrier(ps::kWorkerGroup); } From 4b0e756b4ed41bc729d268e8fe7f4248b847eb78 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 20 Sep 2017 17:21:04 -0700 Subject: [PATCH 061/237] set compress handling --- 
include/mxnet/kvstore.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index f847ac0f0a60..afddab86b7db 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -67,13 +67,12 @@ class KVStore { /** * \brief set to use low-bit compression */ - inline void SetCompress(const std::string& compress, + void SetCompress(const std::string& compress, const float pos_threshold, const float neg_threshold) { compress_ = compress; pos_threshold_ = pos_threshold; neg_threshold_ = neg_threshold; - std::cout<<"Just set"< Date: Thu, 21 Sep 2017 10:45:51 -0700 Subject: [PATCH 062/237] kvstore dist changes --- src/kvstore/kvstore_dist_server.h | 2 ++ src/ndarray/ndarray.cc | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 95f18c9e9669..2dba7aedacfc 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include "ps/ps.h" #include "mxnet/kvstore.h" #include "../operator/tensor/elemwise_binary_op.h" diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 54ad466340f2..b4f83ddc7cde 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -615,7 +615,7 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri }, from.ctx(), const_vars, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { - LOG(FATAL) << "Unsupported dequantization"; + LOG(FATAL) << "Unsupported dequantization"< Date: Thu, 21 Sep 2017 18:05:55 -0700 Subject: [PATCH 063/237] working kvstore push and pull. not sure if I commited that. 
from this commit removing mutable variable changes for residual array gives working push and pull --- include/mxnet/kvstore.h | 9 ++------- python/mxnet/kvstore.py | 1 - src/kvstore/kvstore_dist.h | 8 +++----- src/kvstore/kvstore_dist_server.h | 5 ++--- src/kvstore/kvstore_local.h | 7 +++++++ src/ndarray/ndarray.cc | 10 +++++++--- tests/nightly/dist_sync_kvstore.py | 13 +++++++++---- 7 files changed, 30 insertions(+), 23 deletions(-) diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index afddab86b7db..b7bdbba122bd 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -67,13 +67,8 @@ class KVStore { /** * \brief set to use low-bit compression */ - void SetCompress(const std::string& compress, - const float pos_threshold, - const float neg_threshold) { - compress_ = compress; - pos_threshold_ = pos_threshold; - neg_threshold_ = neg_threshold; - } + virtual void SetCompress(const std::string& compress, const float pos_threshold, + const float neg_threshold) = 0; std::string GetCompressParams() { std::string rval = compress_; diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 5690faa0e823..7c4880600f77 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -376,7 +376,6 @@ def set_compress(self, compress_params={}): and neg_threshold needs to be less than 0') if compress_params['compress'] == '2bit': - print 'Calling CPP KVStoreSetCompress' check_call(_LIB.MXKVStoreSetCompress(self.handle, c_str(compress_params['compress']), mx_float(compress_params['pos_threshold']), diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 1914fdfad42f..7fee417edcdd 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -91,12 +91,10 @@ class KVStoreDist : public KVStoreLocal { } } - void SetCompress(const std::string& compress, - const float pos_threshold, + virtual void SetCompress(const std::string& compress, const float pos_threshold, const float neg_threshold) { - 
KVStore::SetCompress(compress, pos_threshold, neg_threshold); + KVStoreLocal::SetCompress(compress, pos_threshold, neg_threshold); if (get_rank() == 0) { - std::cout << GetCompressParams() << std::endl; SendCommandToServers(kSetCompress, GetCompressParams()); } } @@ -324,7 +322,6 @@ class KVStoreDist : public KVStoreLocal { neg_thre_ = neg_threshold_; } } - // Compress if (compress_ == "2bit") { Quantize(send_buf, &small_buf, &res_buf, @@ -332,6 +329,7 @@ class KVStoreDist : public KVStoreLocal { compress_, priority); } + std::cout<<"finished compress"< elems; split(params, ',', std::back_inserter(elems)); compress_ = elems[0]; @@ -409,12 +408,12 @@ class KVStoreDistServer { TBlob recv_blob((real_t*)req_data.vals.data(), // NOLINT(*) dshape, cpu::kDevMask); NDArray recved = NDArray(recv_blob, 0); - // std::cout << "server: compress shape: " << dshape.Size() << std::endl; + std::cout << "server: compress shape: " << dshape.Size() << std::endl; NDArray comp_buf = compress_buf_[key]; if (compress_ != "none") { long int original_size = (long int)(*(recv_blob.dptr()+2)); dshape = TShape{original_size}; - //std::cout << "server: Uncompress shape: " << dshape.Size() << std::endl; + std::cout << "server: Uncompress shape: " << dshape.Size() << std::endl; if (comp_buf.is_none()) { comp_buf = NDArray(dshape, Context()); } diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index f653339ffd7f..8e2e156d352a 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -135,6 +135,13 @@ class KVStoreLocal : public KVStore { PullRowSparseImpl(keys, val_rowids, priority); } + virtual void SetCompress(const std::string& compress, const float pos_threshold, + const float neg_threshold) { + compress_ = compress; + pos_threshold_ = pos_threshold; + neg_threshold_ = neg_threshold; + } + private: virtual void InitImpl(const std::vector& keys, const std::vector& values) { diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 
b4f83ddc7cde..5921d24b1fda 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -553,10 +553,14 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, << "source operands have zero dimension shape"; // important: callback must always capture by value NDArray ret = *to; + NDArray res = *residual; int a = from.ctx().dev_mask(); int b = to->ctx().dev_mask(); std::vector const_vars; const_vars.push_back(from.var()); + std::vector mutable_vars; + mutable_vars.push_back(ret.var()); + mutable_vars.push_back(res.var()); std::vector inputs(5); inputs[0] = from.data(); @@ -569,7 +573,7 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, if (compress == "2bit") { Engine::Get()->PushSync([inputs](RunContext ctx) { mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs); - }, from.ctx(), const_vars, {ret.var()}, + }, from.ctx(), const_vars, mutable_vars, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { LOG(FATAL) << "Unsupported dequantization"; @@ -580,7 +584,7 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, if (compress == "2bit") { Engine::Get()->PushSync([inputs](RunContext ctx) { mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs); - }, from.ctx(), const_vars, {ret.var()}, + }, from.ctx(), const_vars, mutable_vars, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); } else { LOG(FATAL) << "Unsupported dequantization"; @@ -615,7 +619,7 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri }, from.ctx(), const_vars, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { - LOG(FATAL) << "Unsupported dequantization"< Date: Sat, 23 Sep 2017 01:19:05 +0000 Subject: [PATCH 064/237] cleanup test --- tests/nightly/dist_sync_kvstore.py | 54 ++++++++++++++++-------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/tests/nightly/dist_sync_kvstore.py 
b/tests/nightly/dist_sync_kvstore.py index 93d96b6fff3e..0a03ea2fc2bc 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -179,31 +179,33 @@ def test_compressed(): neg_threshold = -0.5 kv.set_compress({'compress': '2bit', 'pos_threshold': pos_threshold, 'neg_threshold': neg_threshold}) # init kv dns keys - kv.init('99', mx.nd.ones(big_shape)) - kv.init('3', mx.nd.ones(shape)) - kv.push('99', mx.nd.ones(big_shape)*(kv.rank+1)) - # kv.push('99', mx.nd.ones(big_shape)*(kv.rank+1)) - # kv.push('99', mx.nd.ones(big_shape)*(kv.rank+1)) - - def check_default_keys(kv, my_rank, nworker): - - nrepeat = 2 - print 'nrepeat',nrepeat - for i in range(nrepeat): - kv.push('3', mx.nd.ones(shape)*(my_rank+1)) - kv.push('99', mx.nd.ones(big_shape)*(my_rank+1)) - - num = (nworker + 1) * nworker * rate / 2 * nrepeat + 1 - val = mx.nd.zeros(shape) - kv.pull('3', out=val) - check_diff_to_scalar(val, pos_threshold) - - val2 = mx.nd.zeros(big_shape) - kv.pull('99', out=val2) - check_diff_to_scalar(val2, pos_threshold) - print val, val2 - # check_default_keys(kv, kv.rank, kv.num_workers) - + kv.init('99', mx.nd.zeros(big_shape)) + kv.init('3', mx.nd.zeros(shape)) + def verify_residual(pos_threshold): + kv.push('99', mx.nd.ones(big_shape)*0.1) + val=mx.nd.zeros(big_shape) + kv.pull('99',val) + check_diff_to_scalar(val, 0) + kv.push('99', mx.nd.ones(big_shape)*(pos_threshold-0.1)) + val2 = mx.nd.zeros(big_shape) + kv.pull('99',val2) + check_diff_to_scalar(val2, pos_threshold) + kv.push('99', mx.nd.ones(big_shape)*0.2) + val3= mx.nd.zeros(big_shape) + kv.pull('99', val3) + check_diff_to_scalar(val3, 0) + kv.push('99', mx.nd.ones(big_shape)*(pos_threshold-0.2)) + val4 = mx.nd.zeros(big_shape) + kv.pull('99',val4) + check_diff_to_scalar(val4, pos_threshold) + def check_zero(): + kv.push('99', mx.nd.zeros(big_shape)) + val = mx.nd.zeros(big_shape) + kv.pull('99', val) + check_diff_to_scalar(val, 0) + check_zero() + verify_residual(pos_threshold) + if 
__name__ == "__main__": - #test_sync_push_pull() + test_sync_push_pull() test_compressed() From 3f9256e10ba109309aec2a753a98c9203a38fad3 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 25 Sep 2017 00:37:50 +0000 Subject: [PATCH 065/237] debug prints --- src/kvstore/kvstore_dist.h | 15 ++++++++++----- src/kvstore/kvstore_dist_server.h | 4 ++++ src/operator/contrib/two_bit_quantize.cc | 13 +++++++++---- tests/nightly/dist_sync_kvstore.py | 6 +++--- tests/nightly/test.py | 9 +++++++++ 5 files changed, 35 insertions(+), 12 deletions(-) create mode 100644 tests/nightly/test.py diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 7fee417edcdd..833eb09dbaab 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -94,9 +94,9 @@ class KVStoreDist : public KVStoreLocal { virtual void SetCompress(const std::string& compress, const float pos_threshold, const float neg_threshold) { KVStoreLocal::SetCompress(compress, pos_threshold, neg_threshold); - if (get_rank() == 0) { +// if (get_rank() == 0) { SendCommandToServers(kSetCompress, GetCompressParams()); - } +// } } void Barrier() override { @@ -271,7 +271,6 @@ class KVStoreDist : public KVStoreLocal { std::vector uniq_keys; std::vector > grouped_vals; GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals); - for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devcies int key = uniq_keys[i]; @@ -322,14 +321,20 @@ class KVStoreDist : public KVStoreLocal { neg_thre_ = neg_threshold_; } } + std::cout<<"send_buf size is "<())<<" " + <<*(small_buf.data().dptr()+1)<<" " + <<*(small_buf.data().dptr()+2)< vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); + pskv.keys, vals, pskv.lens, kDefaultPushPull, [cb]() { cb(); }); }; Engine::Get()->PushAsync( push_to_servers, diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index d45608fd38c1..6bc0e62d3e57 100644 --- 
a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -412,6 +412,10 @@ class KVStoreDistServer { NDArray comp_buf = compress_buf_[key]; if (compress_ != "none") { long int original_size = (long int)(*(recv_blob.dptr()+2)); + long int original_size1 = (long int)(*(recv_blob.dptr()+1)); + long int original_size3 = (long int)(*(recv_blob.dptr()+3)); + long int original_size0 = (long int)(*(recv_blob.dptr()+0)); + std::cout<("FInferShape", Quantize2BitShape) .set_attr("FInferType", Quantize2BitType) .set_attr("FCompute", Quantize2BitCompute) .set_attr("FGradient", ElemwiseGradUseNone{"_quantize_2bit"}) +.set_attr("FMutateInputs", +[](const nnvm::NodeAttrs& attrs) { + return std::vector{2,4}; +}) .add_argument("gradient_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_argument("residual_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") -.add_argument("neg_shreshold", "NDArray-or-Symbol", "The negative shreshold") -.add_argument("compressed_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") -.add_argument("pos_shreshold", "NDArray-or-Symbol", "The positive shreshold"); +.add_argument("neg_threshold", "NDArray-or-Symbol", "The negative shreshold") +.add_argument("pos_shreshold", "NDArray-or-Symbol", "The positive shreshold") +.add_argument("compressed_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); + NNVM_REGISTER_OP(_contrib_create_2bit) .describe(R"code(Tp generate a compressed array with right shape. 
diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 0a03ea2fc2bc..ac39653ac4ec 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -203,9 +203,9 @@ def check_zero(): val = mx.nd.zeros(big_shape) kv.pull('99', val) check_diff_to_scalar(val, 0) - check_zero() - verify_residual(pos_threshold) +# check_zero() + # verify_residual(pos_threshold) if __name__ == "__main__": - test_sync_push_pull() +# test_sync_push_pull() test_compressed() diff --git a/tests/nightly/test.py b/tests/nightly/test.py new file mode 100644 index 000000000000..9c507a612a07 --- /dev/null +++ b/tests/nightly/test.py @@ -0,0 +1,9 @@ +import mxnet as mx +grad = mx.nd.array([-6.3, -2.1, 3.4, 1.2, 10.5, 5.1, -3.2, 2.0, -8.9, 0]) +residual = mx.nd.array([-3.1, 1.2, -1.3, 5.4, -2.1, 2.9, 3.0, -7.0, -2.9, -100.3]) +neg_threshold = mx.nd.array([-4.0]) +pos_threshold = mx.nd.array([4.0]) +out = mx.contrib.nd.create_2bit(grad) +import pdb +#pdb.set_trace() +mx.contrib.ndarray.quantize_2bit(grad, residual, neg_threshold, pos_threshold, out) From d6be11faea7725e8151c4fcf9ee62a20615ff883 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 25 Sep 2017 11:13:44 -0700 Subject: [PATCH 066/237] working kvstore dist. 
includes mutation of inputs and setting threshold array dtype properly --- src/kvstore/kvstore_dist.h | 14 ++++++-------- src/kvstore/kvstore_dist_server.h | 3 ++- src/kvstore/kvstore_local.h | 1 + src/operator/contrib/two_bit_quantize.cc | 11 ++++++++--- tests/nightly/dist_sync_kvstore.py | 5 ++++- 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 7fee417edcdd..70430472b3a4 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -94,9 +94,9 @@ class KVStoreDist : public KVStoreLocal { virtual void SetCompress(const std::string& compress, const float pos_threshold, const float neg_threshold) { KVStoreLocal::SetCompress(compress, pos_threshold, neg_threshold); - if (get_rank() == 0) { +// if (get_rank() == 0) { SendCommandToServers(kSetCompress, GetCompressParams()); - } +// } } void Barrier() override { @@ -271,7 +271,6 @@ class KVStoreDist : public KVStoreLocal { std::vector uniq_keys; std::vector > grouped_vals; GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals); - for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devcies int key = uniq_keys[i]; @@ -314,11 +313,11 @@ class KVStoreDist : public KVStoreLocal { if (pos_thre_.is_none()) { // positive threshold pos_thre_ = NDArray(TShape{1}, send_buf.ctx(), - false, send_buf.dtype()); + false, mshadow::kFloat32); pos_thre_ = pos_threshold_; // negative threshold neg_thre_ = NDArray(TShape{1}, send_buf.ctx(), - false, send_buf.dtype()); + false, mshadow::kFloat32); neg_thre_ = neg_threshold_; } } @@ -328,9 +327,8 @@ class KVStoreDist : public KVStoreLocal { pos_thre_, neg_thre_, compress_, priority); + small_buf.WaitToRead(); } - std::cout<<"finished compress"< vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, 0, [cb]() { cb(); }); + pskv.keys, vals, pskv.lens, kDefaultPushPull, [cb]() { cb(); }); }; Engine::Get()->PushAsync( push_to_servers, diff --git 
a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index d45608fd38c1..50bf24fa711e 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -146,9 +146,11 @@ class KVStoreDistServer { std::vector elems; split(params, ',', std::back_inserter(elems)); compress_ = elems[0]; + std::cout<<"Setting compress as "< 1) { pos_threshold_ = strtof(elems[1].c_str(), NULL); neg_threshold_ = strtof(elems[2].c_str(), NULL); + std::cout<<"Setting thresholds as "<()+2)); diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 8e2e156d352a..c01590f3025e 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -140,6 +140,7 @@ class KVStoreLocal : public KVStore { compress_ = compress; pos_threshold_ = pos_threshold; neg_threshold_ = neg_threshold; + std::cout<<"Set compress thresholds as "<("FInferType", Quantize2BitType) .set_attr("FCompute", Quantize2BitCompute) .set_attr("FGradient", ElemwiseGradUseNone{"_quantize_2bit"}) +.set_attr("FMutateInputs", +[](const nnvm::NodeAttrs& attrs) { + return std::vector{2,4}; +}) .add_argument("gradient_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_argument("residual_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") -.add_argument("neg_shreshold", "NDArray-or-Symbol", "The negative shreshold") -.add_argument("compressed_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") -.add_argument("pos_shreshold", "NDArray-or-Symbol", "The positive shreshold"); +.add_argument("neg_threshold", "NDArray-or-Symbol", "The negative shreshold") +.add_argument("pos_shreshold", "NDArray-or-Symbol", "The positive shreshold") +.add_argument("compressed_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); + NNVM_REGISTER_OP(_contrib_create_2bit) .describe(R"code(Tp generate a compressed array with right shape. 
diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 0a03ea2fc2bc..815b966fa306 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -181,6 +181,7 @@ def test_compressed(): # init kv dns keys kv.init('99', mx.nd.zeros(big_shape)) kv.init('3', mx.nd.zeros(shape)) + def verify_residual(pos_threshold): kv.push('99', mx.nd.ones(big_shape)*0.1) val=mx.nd.zeros(big_shape) @@ -198,14 +199,16 @@ def verify_residual(pos_threshold): val4 = mx.nd.zeros(big_shape) kv.pull('99',val4) check_diff_to_scalar(val4, pos_threshold) + def check_zero(): kv.push('99', mx.nd.zeros(big_shape)) val = mx.nd.zeros(big_shape) kv.pull('99', val) check_diff_to_scalar(val, 0) + check_zero() verify_residual(pos_threshold) if __name__ == "__main__": - test_sync_push_pull() + #test_sync_push_pull() test_compressed() From e34d2638b97cd24e5bd5da9cc8a06aca1a34727b Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 25 Sep 2017 11:38:24 -0700 Subject: [PATCH 067/237] fix operator --- src/kvstore/kvstore_dist_server.h | 7 +------ src/operator/contrib/two_bit_quantize.cc | 2 +- tests/python/unittest/test_operator.py | 8 ++++++++ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index d333e2cd78ae..1f540bab729d 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -146,11 +146,9 @@ class KVStoreDistServer { std::vector elems; split(params, ',', std::back_inserter(elems)); compress_ = elems[0]; - std::cout<<"Setting compress as "< 1) { pos_threshold_ = strtof(elems[1].c_str(), NULL); neg_threshold_ = strtof(elems[2].c_str(), NULL); - std::cout<<"Setting thresholds as "<()+2)); - long int original_size1 = (long int)(*(recv_blob.dptr()+1)); - long int original_size3 = (long int)(*(recv_blob.dptr()+3)); - long int original_size0 = (long int)(*(recv_blob.dptr()+0)); - std::cout<("FInferShape", Quantize2BitShape) 
.set_attr("FInferType", Quantize2BitType) .set_attr("FCompute", Quantize2BitCompute) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index ad3a532eb0b5..2c4136f38140 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3491,6 +3491,14 @@ def test_quantization_op(): assert same(qa.asnumpy(), qa_real.asnumpy()) assert same(a_.asnumpy(), a_real.asnumpy()) +def test_two_bit_quantization_op(): + grad = mx.nd.array([-6.3, -2.1, 3.4, 1.2, 10.5, 5.1, -3.2, 2.0, -8.9, 0]) + residual = mx.nd.array([-3.1, 1.2, -1.3, 5.4, -2.1, 2.9, 3.0, -7.0, -2.9, -100.3]) + neg_threshold = mx.nd.array([-4.0]) + pos_threshold = mx.nd.array([4.0]) + out = mx.contrib.nd.create_2bit(grad) + mx.contrib.ndarray.quantize_2bit(grad, residual, neg_threshold, pos_threshold, out) + assert same(out, [ -4.00000000e+00 4.00000000e+00 1.00000000e+01 7.40468810e-39]) def test_reciprocal_op(): data_tmp = np.random.rand(3, 4) * 10 - 5 From c0894b18e47416eb3c15c78239e5438d336c92ff Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 25 Sep 2017 18:36:35 -0700 Subject: [PATCH 068/237] kvstore dist changes --- dmlc-core | 2 +- include/mxnet/kvstore.h | 8 -- src/kvstore/kvstore_dist.h | 124 +++++++++++++------------ src/kvstore/kvstore_dist_server.h | 110 +++++++++++----------- src/kvstore/kvstore_local.h | 3 +- src/ndarray/ndarray.cc | 3 +- tests/nightly/dist_sync_kvstore.py | 74 +++++++++------ tests/nightly/test.py | 9 -- tests/python/unittest/test_operator.py | 32 ++++++- 9 files changed, 194 insertions(+), 171 deletions(-) delete mode 100644 tests/nightly/test.py diff --git a/dmlc-core b/dmlc-core index 5b520294be8d..a527100d7d50 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 5b520294be8def5c0372cf8c65a7089422f46653 +Subproject commit a527100d7d5001efc4954848a2fc6027e48c05f4 diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index b7bdbba122bd..a5c34f096958 100644 --- 
a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -70,14 +70,6 @@ class KVStore { virtual void SetCompress(const std::string& compress, const float pos_threshold, const float neg_threshold) = 0; - std::string GetCompressParams() { - std::string rval = compress_; - if (compress_ == "2bit") { - rval += "," + std::to_string(pos_threshold_) + "," + std::to_string(neg_threshold_); - } - return rval; - } - /*! * \brief Initialize a list of key-value pair to the store. * diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index faf8d09dc56d..334e4290ddc7 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "./kvstore_local.h" #include "mxnet/engine.h" @@ -92,11 +93,11 @@ class KVStoreDist : public KVStoreLocal { } virtual void SetCompress(const std::string& compress, const float pos_threshold, - const float neg_threshold) { + const float neg_threshold) override { KVStoreLocal::SetCompress(compress, pos_threshold, neg_threshold); -// if (get_rank() == 0) { - SendCommandToServers(kSetCompress, GetCompressParams()); -// } + if (get_rank() == 0) { + SendCommandToServers(kSetCompress, compress_); + } } void Barrier() override { @@ -271,88 +272,93 @@ class KVStoreDist : public KVStoreLocal { std::vector uniq_keys; std::vector > grouped_vals; GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals); + for (size_t i = 0; i < uniq_keys.size(); ++i) { - // merge over devcies + // merge over devices int key = uniq_keys[i]; const auto& vals = grouped_vals[i]; NDArray merged = do_merge ? 
comm_->Reduce(key, vals, priority) : vals[0]; - - auto& send_buf = comm_buf_[key]; const auto storage_type = merged.storage_type(); + auto& comm_buf = comm_buf_[key]; if (merged.ctx().dev_mask() == cpu::kDevMask) { // make sure the previous push/pull is completed - send_buf.WaitToWrite(); - send_buf = merged; // avoid memory copy + comm_buf.WaitToWrite(); + comm_buf = merged; // avoid memory copy } else { - if (send_buf.is_none()) { + if (comm_buf.is_none()) { if (storage_type == kDefaultStorage) { - send_buf = NDArray(merged.shape(), pinned_ctx_, true, merged.dtype()); + comm_buf = NDArray(merged.shape(), pinned_ctx_, true, merged.dtype()); } else { - send_buf = NDArray(storage_type, merged.shape(), pinned_ctx_, true, merged.dtype()); + comm_buf = NDArray(storage_type, merged.shape(), pinned_ctx_, true, merged.dtype()); } } - CopyFromTo(merged, &send_buf); + CopyFromTo(merged, &comm_buf); } auto& small_buf = comm_small_buf_[key]; auto& res_buf = residual_[key]; - // Init the small buffer and residual_ buffer for quantize - if (small_buf.is_none() && compress_ != "none") { - int bits = compress_ == "2bit" ? 16 : 32; - long int small_size = merged.shape().Size() % bits == 0 ? - merged.shape().Size() / bits + 3 : - merged.shape().Size() / bits + 4; - // small buffer for quantize - small_buf = NDArray(TShape{small_size}, - send_buf.ctx(), false, send_buf.dtype()); - // residual buffer for quantize - res_buf = NDArray(merged.shape(), send_buf.ctx(), - false, send_buf.dtype()); - res_buf = 0; - if (pos_thre_.is_none()) { - // positive threshold - pos_thre_ = NDArray(TShape{1}, send_buf.ctx(), - false, mshadow::kFloat32); - pos_thre_ = pos_threshold_; - // negative threshold - neg_thre_ = NDArray(TShape{1}, send_buf.ctx(), - false, mshadow::kFloat32); - neg_thre_ = neg_threshold_; + // Compress + if (compress_ != "none") { + // Init the small buffer and residual_ buffer for quantize + if (small_buf.is_none()) { + int bits = compress_ == "2bit" ? 
16 : 32; + long int small_size = merged.shape().Size() % bits == 0 ? + merged.shape().Size() / bits + 3 : + merged.shape().Size() / bits + 4; + // small buffer for quantize + small_buf = NDArray(TShape{small_size}, comm_buf.ctx(), false, comm_buf.dtype()); + // residual buffer for quantize + res_buf = NDArray(merged.shape(), comm_buf.ctx(), false, comm_buf.dtype()); + res_buf = 0; + if (pos_thre_.is_none()) { + // positive threshold + pos_thre_ = NDArray(TShape{1}, comm_buf.ctx(), false, mshadow::kFloat32); + pos_thre_ = pos_threshold_; + // negative threshold + neg_thre_ = NDArray(TShape{1}, comm_buf.ctx(), false, mshadow::kFloat32); + neg_thre_ = neg_threshold_; + } + } + + if (compress_ == "2bit") { + Quantize(comm_buf, &small_buf, &res_buf, pos_thre_, neg_thre_, compress_, priority); +// small_buf.WaitToRead(); +// std::cout<<"Original data is "<<*((float *) comm_buf.data().dptr_)< foo(*reinterpret_cast((((float *) small_buf.data().dptr_)+3))); +// std::cout<<"Compressed buf is "<<*((float *) small_buf.data().dptr_)<<" " +// << *(((float *) small_buf.data().dptr_)+1) << " " +// << *(((float *) small_buf.data().dptr_)+2) << " " +// << foo << " " << *(((float *) small_buf.data().dptr_)+3) << std::endl; +// std::cout<<"Res buf is "<< *((float *) res_buf.data().dptr_) <(); #if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(send_buf.data()); + mkl_set_tblob_eager_mode(send_buf.data()); #endif - real_t* data = nullptr; - if (compress_ == "none") { - data = send_buf.data().dptr(); - } else { - data = small_buf.data().dptr(); - } + PSKV& pskv = EncodeKey(key, size, true); // do push. 
false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 1f540bab729d..a998069f35db 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -31,8 +31,7 @@ #include #include #include -#include -#include +#include #include "ps/ps.h" #include "mxnet/kvstore.h" #include "../operator/tensor/elemwise_binary_op.h" @@ -48,16 +47,6 @@ static const int kStopServer = -1; static const int kSyncMode = -2; static const int kSetCompress = 2; -template -void split(const std::string &s, char delim, Out result) { - std::stringstream ss; - ss.str(s); - std::string item; - while (std::getline(ss, item, delim)) { - *(result++) = item; - } -} - /** * \brief executor runs a function using the thread called \ref Start */ @@ -142,16 +131,6 @@ class KVStoreDistServer { controller_ = controller; } - void load_compress_params(const std::string& params) { - std::vector elems; - split(params, ',', std::back_inserter(elems)); - compress_ = elems[0]; - if (elems.size() > 1) { - pos_threshold_ = strtof(elems[1].c_str(), NULL); - neg_threshold_ = strtof(elems[2].c_str(), NULL); - } - } - void set_updater(const KVStore::Updater& updater) { CHECK(updater); updater_ = updater; @@ -176,7 +155,7 @@ class KVStoreDistServer { } else if (recved.head == kSyncMode) { sync_mode_ = true; } else if (recved.head == kSetCompress) { - load_compress_params(recved.body); + compress_ = recved.body; } else { // let the main thread to execute ctrl, which is necessary for python exec_.Exec([this, recved]() { @@ -312,14 +291,14 @@ class KVStoreDistServer { // instead of calling BinaryComputeRspRsp directly using namespace mshadow; Engine::Get()->PushSync([recved, merged, out](RunContext ctx) { - std::vector inputs, outputs; - inputs.push_back(recved); - inputs.push_back(merged.array); - outputs.push_back(out); - op::ElemwiseBinaryOp::ComputeEx( - {}, {}, inputs, 
{kWriteTo}, outputs); - }, recved.ctx(), const_vars, {out.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + std::vector inputs, outputs; + inputs.push_back(recved); + inputs.push_back(merged.array); + outputs.push_back(out); + op::ElemwiseBinaryOp::ComputeEx( + {}, {}, inputs, {kWriteTo}, outputs); + }, recved.ctx(), const_vars, {out.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); CopyFromTo(out, &merged.array, 0); } merged.request.push_back(req_meta); @@ -387,7 +366,6 @@ class KVStoreDistServer { void DataHandleDefault(const ps::KVMeta& req_meta, const ps::KVPairs &req_data, ps::KVServer* server) { - CHECK_EQ(req_meta.cmd, kDefaultPushPull); // do some check CHECK_EQ(req_data.keys.size(), (size_t)1); @@ -408,14 +386,16 @@ class KVStoreDistServer { TBlob recv_blob((real_t*)req_data.vals.data(), // NOLINT(*) dshape, cpu::kDevMask); NDArray recved = NDArray(recv_blob, 0); - NDArray comp_buf = compress_buf_[key]; - recved.WaitToRead(); + NDArray decomp_buf = decomp_buf_[key]; if (compress_ != "none") { +// float neg = (*(recv_blob.dptr())); +// float pos = (*(recv_blob.dptr()+1)); long int original_size = (long int)(*(recv_blob.dptr()+2)); +// std::bitset foo(*reinterpret_cast(recv_blob.dptr()+3)); +// std::cout<<"Server received"<Response(req_meta); stored.WaitToRead(); +// std::cout<<"stored is "<<*((float *)stored.data().dptr_)<<" " <<*((float *)stored.data().dptr_+1)<<" " <<*((float *)stored.data().dptr_+2)<Response(req_meta); stored.WaitToRead(); } @@ -479,15 +470,29 @@ class KVStoreDistServer { } /** - * \brief user defined + * \brief user defined mode for push */ bool sync_mode_; KVStore::Controller controller_; KVStore::Updater updater_; + /** + * \brief store_ contains the value at kvstore for each key + */ std::unordered_map store_; + + /** + * \brief merge_buf_ is a buffer used if sync_mode is true. It represents + * values from different workers being merged. 
The store will be updated + * to this value when values from all workers are pushed into this buffer. + */ std::unordered_map merge_buf_; - std::unordered_map compress_buf_; + + /** + * \brief decomp_buf_ is a buffer into which compressed values are + * decompressed before merging to the store + */ + std::unordered_map decomp_buf_; Executor exec_; ps::KVServer* ps_server_; @@ -495,18 +500,11 @@ class KVStoreDistServer { // whether to LOG verbose information bool log_verbose_; - // set to use gradient compression - std::string compress_; - - /** - * \brief positive threshold for 2bit compression - */ - float pos_threshold_ = 0.1; - /** - * \brief negative threshold for 2bit compression + * \brief compress_ refers to whether values sent to kvstore server are + * in quantized form. It can be 'none' or '2bit' for now. */ - float neg_threshold_ = -0.1; + std::string compress_ = "none"; }; diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index c01590f3025e..f454276fc586 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -136,11 +136,10 @@ class KVStoreLocal : public KVStore { } virtual void SetCompress(const std::string& compress, const float pos_threshold, - const float neg_threshold) { + const float neg_threshold) override { compress_ = compress; pos_threshold_ = pos_threshold; neg_threshold_ = neg_threshold; - std::cout<<"Set compress thresholds as "<ctx().dev_mask(); std::vector const_vars; const_vars.push_back(from.var()); + const_vars.push_back(pos_threshold.var()); + const_vars.push_back(neg_threshold.var()); std::vector mutable_vars; mutable_vars.push_back(ret.var()); mutable_vars.push_back(res.var()); @@ -611,7 +613,6 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri std::vector inputs(2); inputs[0] = from.data(); inputs[1] = to->data(); - if (a == cpu::kDevMask && b == cpu::kDevMask) { if (compress == "2bit") { Engine::Get()->PushSync([inputs](RunContext ctx) { diff --git 
a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 815b966fa306..d2c8784692de 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -174,41 +174,55 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): def test_compressed(): - kv = mx.kv.create('dist_sync') - pos_threshold = 0.5 - neg_threshold = -0.5 - kv.set_compress({'compress': '2bit', 'pos_threshold': pos_threshold, 'neg_threshold': neg_threshold}) - # init kv dns keys - kv.init('99', mx.nd.zeros(big_shape)) - kv.init('3', mx.nd.zeros(shape)) - - def verify_residual(pos_threshold): - kv.push('99', mx.nd.ones(big_shape)*0.1) - val=mx.nd.zeros(big_shape) - kv.pull('99',val) - check_diff_to_scalar(val, 0) - kv.push('99', mx.nd.ones(big_shape)*(pos_threshold-0.1)) - val2 = mx.nd.zeros(big_shape) - kv.pull('99',val2) - check_diff_to_scalar(val2, pos_threshold) - kv.push('99', mx.nd.ones(big_shape)*0.2) - val3= mx.nd.zeros(big_shape) - kv.pull('99', val3) - check_diff_to_scalar(val3, 0) - kv.push('99', mx.nd.ones(big_shape)*(pos_threshold-0.2)) - val4 = mx.nd.zeros(big_shape) - kv.pull('99',val4) - check_diff_to_scalar(val4, pos_threshold) - - def check_zero(): + def init_kv(type): + kv = mx.kv.create(type) + pos_threshold = 0.5 + neg_threshold = -0.5 + kv.set_compress({'compress': '2bit', 'pos_threshold': pos_threshold, 'neg_threshold': neg_threshold}) + # init kv dns keys + kv.init('99', mx.nd.zeros(big_shape)) + kv.init('3', mx.nd.zeros(shape)) + my_rank = kv.rank + nworker = kv.num_workers + return kv, pos_threshold, neg_threshold, my_rank, nworker + + def verify_residual(kv, pos_threshold): + for d in [('99', big_shape), ('3', shape)]: + kv.push(d[0], mx.nd.ones(d[1])*0.4) + val=mx.nd.zeros(d[1]) + kv.pull(d[0],val) + check_diff_to_scalar(val, 0) + kv.push(d[0], mx.nd.ones(d[1])*(pos_threshold - 0.4)) + val2 = mx.nd.zeros(d[1]) + kv.pull(d[0],val2) + check_diff_to_scalar(val2, pos_threshold) + kv.push(d[0], mx.nd.ones(d[1])*0.2) + val3= 
mx.nd.zeros(d[1]) + kv.pull(d[0], val3) + check_diff_to_scalar(val3, 0) + kv.push(d[0], mx.nd.ones(d[1])*(pos_threshold-0.2)) + val4 = mx.nd.zeros(d[1]) + kv.pull(d[0],val4) + check_diff_to_scalar(val4, pos_threshold) + + def check_ones(kv, pos): + kv.push('99',mx.nd.ones(big_shape)*pos*2) + val = mx.nd.zeros(big_shape) + kv.pull('99', val) + check_diff_to_scalar(val, pos) + + def check_zero(kv): kv.push('99', mx.nd.zeros(big_shape)) val = mx.nd.zeros(big_shape) kv.pull('99', val) check_diff_to_scalar(val, 0) - check_zero() - verify_residual(pos_threshold) + for type in ['dist_sync']: + kv, pos, neg, rank, nworker = init_kv(type) + check_zero(kv) + verify_residual(kv, pos) + check_ones(kv, pos) if __name__ == "__main__": - #test_sync_push_pull() + test_sync_push_pull() test_compressed() diff --git a/tests/nightly/test.py b/tests/nightly/test.py deleted file mode 100644 index 9c507a612a07..000000000000 --- a/tests/nightly/test.py +++ /dev/null @@ -1,9 +0,0 @@ -import mxnet as mx -grad = mx.nd.array([-6.3, -2.1, 3.4, 1.2, 10.5, 5.1, -3.2, 2.0, -8.9, 0]) -residual = mx.nd.array([-3.1, 1.2, -1.3, 5.4, -2.1, 2.9, 3.0, -7.0, -2.9, -100.3]) -neg_threshold = mx.nd.array([-4.0]) -pos_threshold = mx.nd.array([4.0]) -out = mx.contrib.nd.create_2bit(grad) -import pdb -#pdb.set_trace() -mx.contrib.ndarray.quantize_2bit(grad, residual, neg_threshold, pos_threshold, out) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 2c4136f38140..ca59140f432f 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3492,13 +3492,35 @@ def test_quantization_op(): assert same(a_.asnumpy(), a_real.asnumpy()) def test_two_bit_quantization_op(): - grad = mx.nd.array([-6.3, -2.1, 3.4, 1.2, 10.5, 5.1, -3.2, 2.0, -8.9, 0]) - residual = mx.nd.array([-3.1, 1.2, -1.3, 5.4, -2.1, 2.9, 3.0, -7.0, -2.9, -100.3]) neg_threshold = mx.nd.array([-4.0]) pos_threshold = mx.nd.array([4.0]) - out = 
mx.contrib.nd.create_2bit(grad) - mx.contrib.ndarray.quantize_2bit(grad, residual, neg_threshold, pos_threshold, out) - assert same(out, [ -4.00000000e+00 4.00000000e+00 1.00000000e+01 7.40468810e-39]) + + grad = mx.nd.array([1.0, 1.0, 1.0]) + residual = mx.nd.array([0.0, 0.0, 0.0]) + compr = mx.contrib.nd.create_2bit(grad) + mx.contrib.ndarray.quantize_2bit(grad, residual, neg_threshold, pos_threshold, compr) + decompr = mx.nd.zeros(grad.shape) + mx.contrib.ndarray.dequantize_2bit(compr, decompr) + assert same(np.zeros(grad.shape), decompr.asnumpy()) + assert same(residual.asnumpy(), np.array([1.0, 1.0, 1.0])) + + grad = mx.nd.array([3.0, 3.0, 3.0]) + mx.contrib.ndarray.quantize_2bit(grad, residual, neg_threshold, pos_threshold, compr) + mx.contrib.ndarray.dequantize_2bit(compr, decompr) + assert same(np.ones(grad.shape)*(pos_threshold.asnumpy()), decompr.asnumpy()) + assert same(residual.asnumpy(), np.array([0.0, 0.0, 0.0])) + + grad = mx.nd.array([1.0, 1.0, 1.0]) + mx.contrib.ndarray.quantize_2bit(grad, residual, neg_threshold, pos_threshold, compr) + mx.contrib.ndarray.dequantize_2bit(compr, decompr) + assert same(np.zeros(grad.shape), decompr.asnumpy()) + assert same(residual.asnumpy(), np.array([1.0, 1.0, 1.0])) + + grad = mx.nd.array([6.0, 6.0, 6.0]) + mx.contrib.ndarray.quantize_2bit(grad, residual, neg_threshold, pos_threshold, compr) + mx.contrib.ndarray.dequantize_2bit(compr, decompr) + assert same(np.ones(grad.shape)*(pos_threshold.asnumpy()), decompr.asnumpy()) + assert same(residual.asnumpy(), np.array([3.0, 3.0, 3.0])) def test_reciprocal_op(): data_tmp = np.random.rand(3, 4) * 10 - 5 From 381941ef60f079f29ae01decd1516d31d535e76a Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 26 Sep 2017 09:36:56 -0700 Subject: [PATCH 069/237] fix compress kvstore issues. 
non compress is broken --- src/kvstore/kvstore_dist.h | 2 +- tests/nightly/dist_sync_kvstore.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 334e4290ddc7..8a9464aa3219 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -272,7 +272,7 @@ class KVStoreDist : public KVStoreLocal { std::vector uniq_keys; std::vector > grouped_vals; GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals); - + std::cout<shape().Size()< Date: Tue, 26 Sep 2017 16:59:55 -0700 Subject: [PATCH 070/237] fix sparse push issue --- src/kvstore/kvstore_dist.h | 69 ++++++++++++++---------------- tests/nightly/dist_sync_kvstore.py | 7 ++- 2 files changed, 36 insertions(+), 40 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 8a9464aa3219..0d8db7bcb812 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -272,7 +272,6 @@ class KVStoreDist : public KVStoreLocal { std::vector uniq_keys; std::vector > grouped_vals; GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals); - std::cout<shape().Size()<(); -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(send_buf.data()); -#endif - PSKV& pskv = EncodeKey(key, size, true); - // do push. 
false means no delete - ps::SArray vals(data, size, false); - CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, kDefaultPushPull, [cb]() { cb(); }); - }; - Engine::Get()->PushAsync( - push_to_servers, - pinned_ctx_, - {send_buf.var()}, - {}, - FnProperty::kNormal, - priority, - PROFILER_MESSAGE("KVStoreDistDefaultPush")); + if (compress_ == "none") { + PushDefault(key, comm_buf, priority); + } else { + PushDefault(key, small_buf, priority); + } } else if (storage_type == kRowSparseStorage) { - PushRowSparse(key, send_buf, priority); + PushRowSparse(key, comm_buf, priority); } else { LOG(FATAL) << "unknown storage type"; } } } + void PushDefault(int key, NDArray &send_buf, int priority){ + auto push_to_servers = + [this, key, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { + // convert to ps keys + size_t size = 0; + real_t* data = nullptr; + size = send_buf.shape().Size(); + data = send_buf.data().dptr(); +#if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(send_buf.data()); +#endif + PSKV& pskv = EncodeKey(key, size, true); + // do push. 
false means no delete + ps::SArray vals(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPush( + pskv.keys, vals, pskv.lens, kDefaultPushPull, [cb]() { cb(); }); + }; + Engine::Get()->PushAsync( + push_to_servers, + pinned_ctx_, + {send_buf.var()}, + {}, + FnProperty::kNormal, + priority, + PROFILER_MESSAGE("KVStoreDistDefaultPush")); + } + // pull row sparse weight into `recv_buf` based on indices given by `indices` void PullRowSparse_(const int key, NDArray *recv_buf, const NDArray& indices, int priority) { using namespace rowsparse; diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index f455df3cf282..a72eaaa68d8e 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -167,12 +167,11 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): check_diff_to_scalar(val, expected, rank=my_rank) check_default_keys(kv, my_rank, nworker) - # check_row_sparse_keys(kv, my_rank, nworker) - # check_row_sparse_keys_with_zeros(kv, my_rank, nworker) - # check_big_row_sparse_keys(kv, my_rank, nworker) + check_row_sparse_keys(kv, my_rank, nworker) + check_row_sparse_keys_with_zeros(kv, my_rank, nworker) + check_big_row_sparse_keys(kv, my_rank, nworker) print('worker ' + str(my_rank) + ' is done') - def test_compressed(): def init_kv(type): kv = mx.kv.create(type) From c0dc329910ed9cc5ae454625095359d9936827b2 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 27 Sep 2017 19:24:36 -0700 Subject: [PATCH 071/237] fix read lock issue --- nnvm | 2 +- src/kvstore/kvstore_dist.h | 48 +++++++++++-------- src/kvstore/kvstore_dist_server.h | 12 +++-- tests/nightly/dist_sync_kvstore.py | 77 ++++++++++++++---------------- 4 files changed, 75 insertions(+), 64 deletions(-) diff --git a/nnvm b/nnvm index e842c098decf..65a1a7104f8d 160000 --- a/nnvm +++ b/nnvm @@ -1 +1 @@ -Subproject commit e842c098decf9f5eb6bd84e307c58e50078596b7 +Subproject commit 65a1a7104f8dca986c57765012555172239b31b1 diff --git 
a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 0d8db7bcb812..dd3f135d08dd 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -322,14 +322,15 @@ class KVStoreDist : public KVStoreLocal { if (compress_ == "2bit") { Quantize(comm_buf, &small_buf, &res_buf, pos_thre_, neg_thre_, compress_, priority); -// small_buf.WaitToRead(); -// std::cout<<"Original data is "<<*((float *) comm_buf.data().dptr_)< foo(*reinterpret_cast((((float *) small_buf.data().dptr_)+3))); -// std::cout<<"Compressed buf is "<<*((float *) small_buf.data().dptr_)<<" " -// << *(((float *) small_buf.data().dptr_)+1) << " " -// << *(((float *) small_buf.data().dptr_)+2) << " " -// << foo << " " << *(((float *) small_buf.data().dptr_)+3) << std::endl; -// std::cout<<"Res buf is "<< *((float *) res_buf.data().dptr_) < foo(*reinterpret_cast((((float *) small_buf.data().dptr_)+3))); + std::cout<<"Compressed buf is "<<*((float *) small_buf.data().dptr_)<<" " + << *(((float *) small_buf.data().dptr_)+1) << " " + << *(((float *) small_buf.data().dptr_)+2) << " " + << foo << " " << *(((float *) small_buf.data().dptr_)+3) << std::endl; + std::cout<<"Res buf is "<< *((float *) res_buf.data().dptr_) < const_vars; + auto& comm_buf = comm_buf_[key]; + const_vars.push_back(comm_buf.var()); + if (compress_ != "none") { + //if compress is set, then send_buf is different from comm_buf + const_vars.push_back(send_buf.var()); + } auto push_to_servers = [this, key, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { - // convert to ps keys - size_t size = 0; - real_t* data = nullptr; - size = send_buf.shape().Size(); - data = send_buf.data().dptr(); + // convert to ps keys + size_t size = 0; + real_t* data = nullptr; + size = send_buf.shape().Size(); + data = send_buf.data().dptr(); #if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(send_buf.data()); + mkl_set_tblob_eager_mode(send_buf.data()); #endif - PSKV& pskv = EncodeKey(key, size, true); - // do push. 
false means no delete - ps::SArray vals(data, size, false); - CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, kDefaultPushPull, [cb]() { cb(); }); + PSKV& pskv = EncodeKey(key, size, true); + // do push. false means no delete + ps::SArray vals(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPush( + pskv.keys, vals, pskv.lens, kDefaultPushPull, [cb]() { cb(); }); }; Engine::Get()->PushAsync( push_to_servers, pinned_ctx_, - {send_buf.var()}, + const_vars, {}, FnProperty::kNormal, priority, diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index a998069f35db..135d09cb2cee 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -155,6 +155,7 @@ class KVStoreDistServer { } else if (recved.head == kSyncMode) { sync_mode_ = true; } else if (recved.head == kSetCompress) { + std::cout<<"Setting compress"<* server) { if (merged->request.size() == (size_t) ps::NumWorkers()) { + std::cout<<"merged buf should be cleared now as size = "<array.data().dptr_)<Response(req); } merged->request.clear(); + std::cout<<"size of request buf"<request.size()<WaitToRead(); } else { merged->array.WaitToRead(); @@ -376,7 +381,7 @@ class KVStoreDistServer { int key = DecodeKey(req_data.keys[0]); auto& stored = store_[key]; - + std::cout<<"For key"<Response(req_meta); stored.WaitToRead(); -// std::cout<<"stored is "<<*((float *)stored.data().dptr_)<<" " <<*((float *)stored.data().dptr_+1)<<" " <<*((float *)stored.data().dptr_+2)< response; CHECK(!stored.is_none()) << "init " << key << " first"; auto len = stored.shape().Size(); + std::cout<<"pull stored is "<<*((float *)stored.data().dptr_)<<" " <<*((float *)stored.data().dptr_+1)<<" " <<*((float *)stored.data().dptr_+2)< Date: Thu, 28 Sep 2017 06:37:23 +0000 Subject: [PATCH 072/237] optimizer is the only issue now? 
--- src/kvstore/.nfsefebc5260d811ddf0000001b | Bin 0 -> 16384 bytes src/kvstore/kvstore_dist.h | 5 ++- src/kvstore/kvstore_dist_server.h | 9 ++++-- tests/nightly/dist_sync_kvstore.py | 37 ++++++++++++----------- 4 files changed, 27 insertions(+), 24 deletions(-) create mode 100644 src/kvstore/.nfsefebc5260d811ddf0000001b diff --git a/src/kvstore/.nfsefebc5260d811ddf0000001b b/src/kvstore/.nfsefebc5260d811ddf0000001b new file mode 100644 index 0000000000000000000000000000000000000000..99035685faf8ad3ceb66ff3457274b938f054e0b GIT binary patch literal 16384 zcmeHNON<;>6)p3#i4zlyUUDQz z>X-UVdx&gH;UhkZkPv}IWL9igMPh-KRaQWVU=f+$0tuu@AcSBMA(D_7IQP}7{&~ih zl~qKQbX48%bL-yo?z^wvebp+h*Uquyg(nqUk1EPPwO?K=9W2ih&3wMJTkGqwz#?x4#g1My9X)Dkq3ciCR~;S}179z0ZUrdd$z3(f zAS`ru6PT8#45SP^00WOGM`xzR{5+gF#6ER!?E(6xO;ZL^22uu622uu622uu622uu6 z20rKvgx+4|28{TSG{lzoJ!AL(%Bv&Gow54=$oh-2{5NCucVztqSzeG2NS^batiK}b zzcyC?uB>m#`Zvbv|1Ikavc4(}O61Gp^p-M^GLSNmGLSNmGLSNmGLSNmGLSNmGLSOx zf60JmD9Q;~zMuBXIR8)f|93v2D6arFfJI;*aQlFwyZ~$fP2e%$5b(w$it@Xj?1i4dkAWM&2Jji+ z0I(0Z^)bi>T7U*D0*?dxfxmuKQGN^D0-gsh1FOIn0S4>^_5gQ2qA2eHKLc(8Uj|ly zIUoyUfX@M+1z!8GqI?@T3rqoTV_e<@{sH_QcpZ2V_&)GGzyT%z8o!5t_i-^?^JlB` zOKN?kSzlXT&U%5`bNx;3tGN>jRquA`t`meT2#s<%2z}G(9AUa`s4dsn&M|k3`@U&# z_N>C#mTMZU!5w4XwQbEYnr?&pXb@ygClsAbBM_$H*?Hg#SupW$IP zH+5=GgGBB_!E$+XL#TMhcOy@zDISru7sm&hIlMFYyBok|X3@wB^u#-&wjQ!#o<5mz4 zFpb)%7X@N~RSa7I-ekY5yi1#j)mjj{LftiZHkXhpWc6FK*;$q`oh{9R8%Yrv(m@RR zu>9EOwOHXJN!xq7gTXBxayVV$_KDY%pDmgWp86%8ZN<)~ws;$(t@5j8m^eq`wnRe0 z8U5bT;WyzeUTkunWilh9Uo2uaxgHCc0Mc1!bRyLk=()X^*Nw9zqw=g}P9r%#tc2rFCmK$*L zAH%kEF`ui>RyK3)LcQ8l(YLv>T2&Wnji$P~vY|H4S5_NUb-lVex6-I)@~o}F&m>bV zvxAY)wh?9`$GD%&x!Re^>T2bpI=8Y`Ux29TQmMoy#7zoJMl87#gX0l~O~ys>xQU?C zFVef*h%D|8jpTjemqL$-NAb`uP0c{{xlS>3doU>&D;9fQ_(NCoJnjU{h$!l^q~#YB zAy=r{2sJ;f1lx|D&4>@}E|&(rHgI(eiNkxM$I^Q20JA>VEHylpElG=Z_YTI&BjGO_ zVU!KNg`lk>_}Mu`zW%JrI+100p^w7YpVfYea6^98vrOF#6N$o?TCQucaj6RPq88eg z72VQ;V7LPmitCa>5td2*L=j-|!WI;u0%ryfHPZ^1>#)l{Z)3jFnsgTHlLeO-3{oF6 
zd?FB2GFF>8h*T(f+HJAW#Y*9XK9r<6i!iBCL4aw%9etZ=wu|x6EDLf-6dE~$mL3S< z1~UV+=xN)5kfoUpjVXz6yu{?B+Z6YhEQ=r+LZE`%@ z#~<#SB77MI=Agpb20fxXgO$|YK>=p-(OjP7A+#|q5lD9CVSS=qC(^L|g`5)`@Isg+ zqzO?Z1(pO63?fyf!IxWDvD=J3==9B26rxss?5PwdCh#Hyr2q5UD28fKCL9+f zhw5d`7QBmI2cI4dF%S8+xtts^7S@hm9QaMXZnBg{=APJjonIV?R;S$a2xV{;7?cu^Lu7lFylK90A zB?fO}C19f=dn9z9Rf#hP4e!%R&c)NQZ;UvHX?qs8xf8}t-_inj3_RNlreG2$A3fbR zEsj7Z9VYo$Sh$mvjD>;wVm2%z6gdGGlO^!DPVYS-GSgny^bCpJ| zk!KsV=Gm3CCflHesotzr8*F8j&9BrKYR%e89sd?trGAlm25vj-(L5fWKX(~N0c5xV z=+YeOzUEk@r_Y~>OdmY95EMKLvRVraT3U}x&^kUx9U__OupT@D(T8FI31F-nhF%$~ zPOn$cFipF>;QE~+L=dY%ak(~Mtv9MuGli1u(gR1sM1Y!$Zp3VDo4GB7Tj+)`=iy&_ zxEbDZ=z$qIURbKww(RD(A*}#t4Lily^Q$`dLeiJsuq{(_bU`$1piTpVDnex@h345h zb{rH-p)SkH(eXLnCkzU?q?T`Xx*>CW4qA3i585{oQ!9-{ctmXTKo3%Qn0INqjL2~; zH4G5y;uR9wBDmVZt*IlJM-bv7$e@_%rtUF9Y8N zY=8sjfHS}`UYfR}(D0p9|?3TVJ3paPr*4g!BeKHwJcJfH$6 zfFr;sfk%ONq37GcTfiTIUjVNHKL^ML${nOv%0S9M%0S9M%0S9M%0S9M%D@McfjHek z*}nU1q!s0wXf>8GxvMluxhU5tBP?=i!+BYp1Tt1%s8Cw9f7GB-)1)vtArOT{=q2+y z{WA@*+j%BJdI-64bQFg|k>mQ{Cl9Tgr vI$J}!*dMV_j3rbaRNYJML{mCUjY&Z3ES>@13joKXG;TltX- literal 0 HcmV?d00001 diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index dd3f135d08dd..5ece2c0a15a0 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -296,7 +296,6 @@ class KVStoreDist : public KVStoreLocal { auto& small_buf = comm_small_buf_[key]; auto& res_buf = residual_[key]; - // Compress if (compress_ != "none") { // Init the small buffer and residual_ buffer for quantize @@ -354,10 +353,10 @@ class KVStoreDist : public KVStoreLocal { void PushDefault(int key, NDArray &send_buf, int priority){ std::vector const_vars; auto& comm_buf = comm_buf_[key]; - const_vars.push_back(comm_buf.var()); + const_vars.push_back(send_buf.var()); if (compress_ != "none") { //if compress is set, then send_buf is different from comm_buf - const_vars.push_back(send_buf.var()); + const_vars.push_back(comm_buf.var()); } auto push_to_servers = [this, key, send_buf](RunContext 
rctx, Engine::CallbackOnComplete cb) { diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 135d09cb2cee..1f31808a5c6a 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -182,7 +182,7 @@ class KVStoreDistServer { ps::KVServer* server) { if (merged->request.size() == (size_t) ps::NumWorkers()) { std::cout<<"merged buf should be cleared now as size = "<array.data().dptr_)<array.data().dptr_)<Response(req); } merged->request.clear(); - std::cout<<"size of request buf"<request.size()<request.size()<WaitToRead(); + //std::cout<<"stored now has "<<*((float *)stored->data().dptr_)<array.WaitToRead(); } @@ -424,6 +424,8 @@ class KVStoreDistServer { if (compress_ == "none") { CopyFromTo(recved, &merged.array, 0); } else { + std::cout<<"Should be this case"< Date: Thu, 28 Sep 2017 18:46:22 +0000 Subject: [PATCH 073/237] fix all issues with gc dist --- src/kvstore/kvstore_dist.h | 21 +++++++++++---------- src/kvstore/kvstore_dist_server.h | 24 +++++------------------- tests/nightly/dist_sync_kvstore.py | 25 ++++++++++++++----------- 3 files changed, 30 insertions(+), 40 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 5ece2c0a15a0..280451283f4b 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -272,12 +272,14 @@ class KVStoreDist : public KVStoreLocal { std::vector uniq_keys; std::vector > grouped_vals; GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals); + for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devices int key = uniq_keys[i]; const auto& vals = grouped_vals[i]; NDArray merged = do_merge ? 
comm_->Reduce(key, vals, priority) : vals[0]; const auto storage_type = merged.storage_type(); + auto& comm_buf = comm_buf_[key]; if (merged.ctx().dev_mask() == cpu::kDevMask) { // make sure the previous push/pull is completed @@ -296,7 +298,6 @@ class KVStoreDist : public KVStoreLocal { auto& small_buf = comm_small_buf_[key]; auto& res_buf = residual_[key]; - // Compress if (compress_ != "none") { // Init the small buffer and residual_ buffer for quantize if (small_buf.is_none()) { @@ -321,15 +322,15 @@ class KVStoreDist : public KVStoreLocal { if (compress_ == "2bit") { Quantize(comm_buf, &small_buf, &res_buf, pos_thre_, neg_thre_, compress_, priority); - small_buf.WaitToRead(); - res_buf.WaitToRead(); - std::cout<<"Original data is "<<*((float *) comm_buf.data().dptr_)< foo(*reinterpret_cast((((float *) small_buf.data().dptr_)+3))); - std::cout<<"Compressed buf is "<<*((float *) small_buf.data().dptr_)<<" " - << *(((float *) small_buf.data().dptr_)+1) << " " - << *(((float *) small_buf.data().dptr_)+2) << " " - << foo << " " << *(((float *) small_buf.data().dptr_)+3) << std::endl; - std::cout<<"Res buf is "<< *((float *) res_buf.data().dptr_) < foo(*reinterpret_cast((((float *) small_buf.data().dptr_)+3))); + //std::cout<<"Compressed buf is "<<*((float *) small_buf.data().dptr_)<<" " + // << *(((float *) small_buf.data().dptr_)+1) << " " + // << *(((float *) small_buf.data().dptr_)+2) << " " + // << foo << " " << *(((float *) small_buf.data().dptr_)+3) << std::endl; + //std::cout<<"Res buf is "<< *((float *) res_buf.data().dptr_) <* server) { if (merged->request.size() == (size_t) ps::NumWorkers()) { - std::cout<<"merged buf should be cleared now as size = "<array.data().dptr_)<array, stored); + updater_(key, merged->array, stored); }); } else { // if no updater, just copy @@ -200,9 +198,7 @@ class KVStoreDistServer { server->Response(req); } merged->request.clear(); - //std::cout<<"size of request buf"<request.size()<WaitToRead(); - //std::cout<<"stored now has 
"<<*((float *)stored->data().dptr_)<array.WaitToRead(); } @@ -381,7 +377,7 @@ class KVStoreDistServer { int key = DecodeKey(req_data.keys[0]); auto& stored = store_[key]; - std::cout<<"For key"<())); -// float pos = (*(recv_blob.dptr()+1)); long int original_size = (long int)(*(recv_blob.dptr()+2)); -// std::bitset foo(*reinterpret_cast(recv_blob.dptr()+3)); -// std::cout<<"Server received"<Response(req_meta); stored.WaitToRead(); - std::cout<<"stored is inited to "<<*((float *)stored.data().dptr_)<<" " <<*((float *)stored.data().dptr_+1)<<" " <<*((float *)stored.data().dptr_+2)< response; CHECK(!stored.is_none()) << "init " << key << " first"; auto len = stored.shape().Size(); - std::cout<<"pull stored is "<<*((float *)stored.data().dptr_)<<" " <<*((float *)stored.data().dptr_+1)<<" " <<*((float *)stored.data().dptr_+2)< Date: Thu, 28 Sep 2017 16:15:51 -0700 Subject: [PATCH 074/237] fix read lock issue --- tests/nightly/dist_sync_kvstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 6d8a2cc95330..98ac0695dde4 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -203,7 +203,7 @@ def check_ones(kv, pos): kv.push('221',mx.nd.ones(big_shape)*pos*4) val = mx.nd.zeros(big_shape) kv.pull('221', val) - curval = pos*2*3 + curval = pos*2*3 check_diff_to_scalar(val, curval) def check_zero(kv): From dbcec872ca8af0b0512ca3af62820bc90b092f24 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 2 Oct 2017 12:08:00 -0700 Subject: [PATCH 075/237] pushing sharded data works --- src/kvstore/kvstore_dist.h | 169 +++++++++++++++++++++++++++--- src/kvstore/kvstore_dist_server.h | 6 +- 2 files changed, 161 insertions(+), 14 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 280451283f4b..fd2487b2edca 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -98,6 +98,9 @@ class KVStoreDist : 
public KVStoreLocal { if (get_rank() == 0) { SendCommandToServers(kSetCompress, compress_); } + //this fails. everyone just waits. why? +// Barrier(); +// ps::Postoffice::Get()->Barrier(ps::kWorkerGroup + ps::kServerGroup); } void Barrier() override { @@ -196,7 +199,8 @@ class KVStoreDist : public KVStoreLocal { RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys size_t size = recv_buf.shape().Size(); - PSKV& pskv = EncodeKey(key, size, false); + bool is_compressed = (compress_!="none"); + PSKV& pskv = EncodeKey(key, size, false, is_compressed); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(recv_buf.data()); #endif @@ -295,7 +299,6 @@ class KVStoreDist : public KVStoreLocal { } CopyFromTo(merged, &comm_buf); } - auto& small_buf = comm_small_buf_[key]; auto& res_buf = residual_[key]; if (compress_ != "none") { @@ -335,13 +338,12 @@ class KVStoreDist : public KVStoreLocal { LOG(FATAL) << "Unsupported quantization"; } } - // push to servers if (storage_type == kDefaultStorage) { if (compress_ == "none") { PushDefault(key, comm_buf, priority); } else { - PushDefault(key, small_buf, priority); + PushCompressed(key, comm_buf, small_buf, priority); } } else if (storage_type == kRowSparseStorage) { PushRowSparse(key, comm_buf, priority); @@ -351,14 +353,75 @@ class KVStoreDist : public KVStoreLocal { } } - void PushDefault(int key, NDArray &send_buf, int priority){ + + void PushCompressed(int key, NDArray& comm_buf, NDArray &send_buf, int priority){ + auto& comm_small_send_buf = comm_small_send_buf_[key]; + PSKV& pskv = EncodeCompressedKey(key, send_buf.shape().Size(), true); std::vector const_vars; - auto& comm_buf = comm_buf_[key]; - const_vars.push_back(send_buf.var()); - if (compress_ != "none") { + const_vars.push_back(comm_buf.var()); + if (comm_buf.shape().Size() > bigarray_bound_) { + if (comm_small_send_buf.is_none()) { + comm_small_send_buf = NDArray(TShape{pskv.size}, comm_buf.ctx(), false, comm_buf.dtype()); + } + size_t prev_from = 3; 
+ size_t prev_to = 0; + int cursize = 0; + int original_size = comm_buf.shape().Size(); + CHECK_GT(original_size,0); + NDArray meta = send_buf.Slice(0,3); + for(size_t i = 0; i bigarray_bound_ ){ + data = comm_small_send_buf.data().dptr(); + #if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(comm_small_send_buf.data()); + #endif + size = comm_small_send_buf.shape().Size(); + } else { + data = send_buf.data().dptr(); + #if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(send_buf.data()); + #endif + size = send_buf.shape().Size(); + } + // do push. false means no delete + ps::SArray vals(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPush( + pskv.keys, vals, pskv.lens, kDefaultPushPull, [cb]() { cb(); }); + }; + Engine::Get()->PushAsync( + push_to_servers, + pinned_ctx_, + const_vars, + {}, + FnProperty::kNormal, + priority, + PROFILER_MESSAGE("KVStoreDistCompressedPush")); + } + + void PushDefault(int key, NDArray &send_buf, int priority){ auto push_to_servers = [this, key, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys @@ -378,7 +441,7 @@ class KVStoreDist : public KVStoreLocal { Engine::Get()->PushAsync( push_to_servers, pinned_ctx_, - const_vars, + {send_buf.var()}, {}, FnProperty::kNormal, priority, @@ -492,6 +555,88 @@ class KVStoreDist : public KVStoreLocal { */ std::mutex mu_; + size_t roundUp(size_t numToRound, size_t multiple) + { + assert(multiple && ((multiple & (multiple - 1)) == 0)); + return (numToRound + multiple - 1) & -multiple; + } + + + PSKV& EncodeKey(int key, size_t size, bool is_push, bool is_compressed) { + if (is_compressed) { + EncodeCompressedKey(key, size, is_push); + } else { + EncodeKey(key, size, is_push); + } + } + + /** + * \brief convert to keys in ps for compressed values + * \brief buf_size will be size of recv_buf (original size) if pull + * buf_size will be size of quantized array if push. 
Actual size of + * send_buf in this case will add few counts of meta information + * to each part if divided + */ + inline PSKV& EncodeCompressedKey(int key, size_t buf_size, bool is_push) { + size_t original_size = comm_buf_[key].shape().Size(); + size_t size = (is_push) ? buf_size : original_size; + mu_.lock(); + PSKV& pskv = (is_push) ? push_ps_kv_[key] : pull_ps_kv_[key]; + mu_.unlock(); + if (!pskv.keys.empty()) { + //will fail +// CHECK_EQ(static_cast(pskv.size), size)<< "The value size cannot be changed"; + } else { + auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); + int num_servers = krs.size(); + CHECK_GT(num_servers, 0); + // a simple heuristic for load balance + if (original_size < bigarray_bound_) { + // send it to a single random picked server + int server = (key * 9973) % num_servers; + ps::Key ps_key = krs[server].begin() + key; + CHECK_LT(ps_key, krs[server].end()); + pskv.keys.push_back(ps_key); + pskv.lens.push_back(size); + pskv.size = size; + } else { + // partition it to all servers + pskv.size = 0; + size_t final_size; + if (is_push) { + final_size = buf_size+3*(num_servers-1); + for (int i = 0; i < num_servers; ++i) { + //if pushing, divide size of compressed array into blocks of 16, so we don't split between a compressed value + //if pulling, need to divide exact way as above did + size_t part_size = is_push? 
(roundUp((size-3)/num_servers*(i+1), 16) - roundUp((size-3)/num_servers*(i), 16) + 3) + : (roundUp((size)/num_servers*(i+1), 1) - roundUp((size)/num_servers*(i), 1)); + ps::Key ps_key = krs[i].begin() + key; + CHECK_LT(ps_key, krs[i].end()); + pskv.keys.push_back(ps_key); + + //if last block was rounded up to beyond size of our data, set it to end of data + if (i == num_servers-1 && ((pskv.size+part_size) > final_size)) { + part_size = buf_size + 3*(num_servers-1) - pskv.size; + } + pskv.lens.push_back(part_size); + pskv.size += part_size; + } + CHECK_EQ(static_cast(pskv.size), final_size); + } else { + PSKV& push_pskv = push_ps_kv_[key]; + for (int i=0; i(pskv.size), size) << "The value size cannot be changed"; } else { auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); @@ -532,7 +676,6 @@ class KVStoreDist : public KVStoreLocal { CHECK_EQ(static_cast(pskv.size), size); } } - return pskv; } @@ -606,10 +749,10 @@ class KVStoreDist : public KVStoreLocal { * \brief threshold for partition */ size_t bigarray_bound_; - std::unordered_map comm_buf_; /// \brief small buffer for quantize std::unordered_map comm_small_buf_; + std::unordered_map comm_small_send_buf_; /// \brief residual buffer for quantize std::unordered_map residual_; /// \brief threshold for quantize diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 26c181202f39..b600b66b9025 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -156,7 +156,6 @@ class KVStoreDistServer { sync_mode_ = true; } else if (recved.head == kSetCompress) { compress_ = recved.body; - std::cout<<"Setting compress to "<()+0)) +// <<" and "<<(*(recv_blob.dptr()+1))<<" "<<(*(recv_blob.dptr()+2))<< " "<<(*(recv_blob.dptr()+3))<()+2)); dshape = TShape{original_size}; if (decomp_buf.is_none()) { decomp_buf = NDArray(dshape, Context()); } + std::cout<<"in data handle of server, original size is "< Date: Mon, 2 Oct 2017 19:37:28 -0700 Subject: [PATCH 076/237] 
works most times. sometimes val instead of 0 has parts of 1 or 1.5... --- src/kvstore/kvstore_dist.h | 23 ++++++++++++-------- src/kvstore/kvstore_dist_server.h | 21 ++++++++++++++---- tests/nightly/dist_sync_kvstore.py | 35 +++++++++++++++++------------- 3 files changed, 51 insertions(+), 28 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index fd2487b2edca..2c68a874e285 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -181,7 +181,6 @@ class KVStoreDist : public KVStoreLocal { std::vector uniq_keys; std::vector > grouped_vals; GroupKVPairsPull(keys, values, &uniq_keys, &grouped_vals); - for (size_t i = 0; i < uniq_keys.size(); ++i) { int key = uniq_keys[i]; // use the same array for merging to guarantee that pull always happens @@ -195,19 +194,21 @@ class KVStoreDist : public KVStoreLocal { recv_buf = NDArray(grouped_vals[i][0]->shape(), pinned_ctx_, true, grouped_vals[i][0]->dtype()); } - auto pull_from_servers = [this, key, recv_buf]( + bool is_compressed = (compress_!="none"); + auto pull_from_servers = [this, key, recv_buf, is_compressed]( RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys size_t size = recv_buf.shape().Size(); - bool is_compressed = (compress_!="none"); PSKV& pskv = EncodeKey(key, size, false, is_compressed); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(recv_buf.data()); #endif real_t* data = recv_buf.data().dptr(); // false means not to delete data when SArray is deleted + auto vals = new ps::SArray(data, size, false); // issue pull + CHECK_NOTNULL(ps_worker_)->ZPull( pskv.keys, vals, &pskv.lens, kDefaultPushPull, [vals, cb](){ delete vals; cb(); }); }; @@ -220,7 +221,6 @@ class KVStoreDist : public KVStoreLocal { FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreDistDefaultPull")); - comm_->Broadcast(key, recv_buf, grouped_vals[i], priority); } } @@ -355,6 +355,7 @@ class KVStoreDist : public KVStoreLocal { void PushCompressed(int key, NDArray& 
comm_buf, NDArray &send_buf, int priority){ + std::cout<<"send buf data"<<* (float*) (send_buf.data().dptr_)<<" "<<* ((float*)send_buf.data().dptr_+3)< const_vars; @@ -380,6 +381,9 @@ class KVStoreDist : public KVStoreLocal { prev_to += pskv.lens[i]; prev_from += pskv.lens[i]-3; } + comm_small_send_buf.WaitToRead(); + std::cout<<"commsmallsendbuf data"<<* (float*) (comm_small_send_buf.data().dptr_)<<" "<<* ((float*)comm_small_send_buf.data().dptr_+1)<<" "<<* ((float*)comm_small_send_buf.data().dptr_+2)<<" "<<* ((float*)comm_small_send_buf.data().dptr_+3)<(pskv.size), final_size); } else { + mu_.lock(); PSKV& push_pskv = push_ps_kv_[key]; + mu_.unlock(); for (int i=0; i()+0)) // <<" and "<<(*(recv_blob.dptr()+1))<<" "<<(*(recv_blob.dptr()+2))<< " "<<(*(recv_blob.dptr()+3))<()+2)); dshape = TShape{original_size}; if (decomp_buf.is_none()) { decomp_buf = NDArray(dshape, Context()); } - std::cout<<"in data handle of server, original size is "<Response(req_meta); stored.WaitToRead(); } +// std::cout<<"Server: Finished push"< response; diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 98ac0695dde4..e9e5fb43f001 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -68,15 +68,15 @@ def test_sync_push_pull(): kv, my_rank, nworker = init_kv() def check_default_keys(kv, my_rank, nworker): - nrepeat = 3 + nrepeat = 1 for i in range(nrepeat): - kv.push('3', mx.nd.ones(shape)*(my_rank+1)) + # kv.push('3', mx.nd.ones(shape)*(my_rank+1)) kv.push('99', mx.nd.ones(big_shape)*(my_rank+1)) num = (nworker + 1) * nworker * rate / 2 * nrepeat + 1 - val = mx.nd.zeros(shape) - kv.pull('3', out=val) - check_diff_to_scalar(val, num) + # val = mx.nd.zeros(shape) + # kv.pull('3', out=val) + # check_diff_to_scalar(val, num) val2 = mx.nd.zeros(big_shape) kv.pull('99', out=val2) @@ -178,7 +178,7 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): expected[row] = updated_val[row] check_diff_to_scalar(val, expected, 
rank=my_rank) - def verify_residual(kv, pos_threshold): + def verify_residual(kv, pos_threshold, nworker): for d in [('221', big_shape), ('21', shape)]: kv.push(d[0], mx.nd.ones(d[1])*0.4) val=mx.nd.zeros(d[1]) @@ -187,7 +187,7 @@ def verify_residual(kv, pos_threshold): kv.push(d[0], mx.nd.ones(d[1])*(pos_threshold - 0.4)) val2 = mx.nd.zeros(d[1]) kv.pull(d[0],val2) - curval = pos_threshold * 2 + curval = pos_threshold * 2 * nworker check_diff_to_scalar(val2, curval) kv.push(d[0], mx.nd.ones(d[1])*0.2) val3= mx.nd.zeros(d[1]) @@ -196,18 +196,22 @@ def verify_residual(kv, pos_threshold): kv.push(d[0], mx.nd.ones(d[1])*(pos_threshold-0.2)) val4 = mx.nd.zeros(d[1]) kv.pull(d[0],val4) - curval += pos_threshold*2 + curval += pos_threshold*2*nworker check_diff_to_scalar(val4, curval) - def check_ones(kv, pos): - kv.push('221',mx.nd.ones(big_shape)*pos*4) + def check_ones(kv, pos, nworker): val = mx.nd.zeros(big_shape) kv.pull('221', val) - curval = pos*2*3 - check_diff_to_scalar(val, curval) + curval = val[0][0].asnumpy()[0] + kv.push('221',mx.nd.ones(big_shape)*pos*4) + val2 = mx.nd.zeros(big_shape) + kv.pull('221', val2) + newval = curval + 2*nworker*pos + check_diff_to_scalar(val2, newval) def check_zero(kv): kv.push('221', mx.nd.zeros(big_shape)) + # to check that all are set to 0s val = mx.nd.ones(big_shape) kv.pull('221', val) check_diff_to_scalar(val, 0) @@ -219,9 +223,10 @@ def check_zero(kv): print('worker ' + str(my_rank) + ' is done with non compression tests') kv, pos, neg = init_kv_compressed(kv) - check_zero(kv) - verify_residual(kv, pos) - check_ones(kv, pos) + # print ('pushing now') + # check_zero(kv) + # verify_residual(kv, pos, nworker) + # check_ones(kv, pos, nworker) print('worker ' + str(my_rank) + ' is done with compression tests') if __name__ == "__main__": From 0bc1da3a414457949273450675f0b57c0285f277 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 3 Oct 2017 17:04:33 -0700 Subject: [PATCH 077/237] fix read lock issue --- 
src/kvstore/kvstore_dist.h | 112 ++++++++++++++++++++++++------------- 1 file changed, 74 insertions(+), 38 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 2c68a874e285..b97367f730f4 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -33,6 +33,8 @@ #include "ps/ps.h" #include "./kvstore_dist_server.h" #include "../ndarray/ndarray_function.h" +#include // for uint32_t + #if MKL_EXPERIMENTAL == 1 #include #include "../operator/mkl/mkl_memory-inl.h" @@ -51,7 +53,27 @@ namespace kvstore { * it's the server node's job to control the data consistency among all * workers. see details on \ref ServerHandle::Start */ -class KVStoreDist : public KVStoreLocal { + + void floatToBinary(float f, std::string& str) + { + union { float f; uint32_t i; } u; + u.f = f; + str.clear(); + + for (int i = 0; i < 32; i++) + { + if (u.i % 2) str.push_back('1'); + else str.push_back('0'); + u.i >>= 1; + } + + // Reverse the string since now it's backwards + std::string temp(str.rbegin(), str.rend()); + str = temp; + } + + + class KVStoreDist : public KVStoreLocal { public: explicit KVStoreDist(bool use_device_comm) : KVStoreLocal(use_device_comm), ps_worker_(nullptr), server_(nullptr) { @@ -324,10 +346,15 @@ class KVStoreDist : public KVStoreLocal { } if (compress_ == "2bit") { +// comm_buf.WaitToRead(); +// for (int i=0; i foo(*reinterpret_cast((((float *) small_buf.data().dptr_)+3))); //std::cout<<"Compressed buf is "<<*((float *) small_buf.data().dptr_)<<" " // << *(((float *) small_buf.data().dptr_)+1) << " " @@ -355,7 +382,15 @@ class KVStoreDist : public KVStoreLocal { void PushCompressed(int key, NDArray& comm_buf, NDArray &send_buf, int priority){ - std::cout<<"send buf data"<<* (float*) (send_buf.data().dptr_)<<" "<<* ((float*)send_buf.data().dptr_+3)< const_vars; @@ -382,8 +417,14 @@ class KVStoreDist : public KVStoreLocal { prev_from += pskv.lens[i]-3; } comm_small_send_buf.WaitToRead(); - std::cout<<"commsmallsendbuf 
data"<<* (float*) (comm_small_send_buf.data().dptr_)<<" "<<* ((float*)comm_small_send_buf.data().dptr_+1)<<" "<<* ((float*)comm_small_send_buf.data().dptr_+2)<<" "<<* ((float*)comm_small_send_buf.data().dptr_+3)<GetServerKeyRanges(); + int num_servers = krs.size(); + CHECK_GT(num_servers, 0); if (!pskv.keys.empty()) { - //will fail -// CHECK_EQ(static_cast(pskv.size), size)<< "The value size cannot be changed"; + if (is_push) { + CHECK_EQ(static_cast(pskv.size), buf_size+3*(num_servers-1))<< "The value size can't be changed"; + } else { + CHECK_EQ(static_cast(pskv.size), original_size)<< "The value size can't be changed"; + } } else { - auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); - int num_servers = krs.size(); - CHECK_GT(num_servers, 0); // a simple heuristic for load balance if (original_size < bigarray_bound_) { // send it to a single random picked server @@ -605,38 +649,30 @@ class KVStoreDist : public KVStoreLocal { } else { // partition it to all servers pskv.size = 0; - size_t final_size; - if (is_push) { - final_size = buf_size+3*(num_servers-1); - for (int i = 0; i < num_servers; ++i) { - //if pushing, divide size of compressed array into blocks of 16, so we don't split between a compressed value - //if pulling, need to divide exact way as above did - size_t part_size = is_push? 
(roundUp((size-3)/num_servers*(i+1), 16) - roundUp((size-3)/num_servers*(i), 16) + 3) - : (roundUp((size)/num_servers*(i+1), 1) - roundUp((size)/num_servers*(i), 1)); - ps::Key ps_key = krs[i].begin() + key; - CHECK_LT(ps_key, krs[i].end()); - pskv.keys.push_back(ps_key); - - //if last block was rounded up to beyond size of our data, set it to end of data - if (i == num_servers-1 && ((pskv.size+part_size) > final_size)) { - part_size = buf_size + 3*(num_servers-1) - pskv.size; - } - pskv.lens.push_back(part_size); - pskv.size += part_size; + size_t final_size = buf_size+3*(num_servers-1); + for (int i = 0; i < num_servers; ++i) { + //if pushing, divide size of compressed array into blocks of 16, so we don't split between a compressed value + size_t part_size = is_push? (roundUp((size-3)/num_servers*(i+1), 16) - roundUp((size-3)/num_servers*(i), 16) + 3) + : (roundUp((size)/num_servers*(i+1), 1) - roundUp((size)/num_servers*(i), 1)); + ps::Key ps_key = krs[i].begin() + key; + CHECK_LT(ps_key, krs[i].end()); + pskv.keys.push_back(ps_key); + + //if last block was rounded up to beyond size of our data, set it to end of data + if (i == num_servers-1 && ((pskv.size+part_size) > final_size)) { + part_size = buf_size + 3*(num_servers-1) - pskv.size; } + pskv.lens.push_back(part_size); + pskv.size += part_size; + } + if (is_push) { CHECK_EQ(static_cast(pskv.size), final_size); } else { - mu_.lock(); - PSKV& push_pskv = push_ps_kv_[key]; - mu_.unlock(); - for (int i=0; i(pskv.size), original_size); } - } } return pskv; From d120e9a2eb79fbfded5b236c2a0139934f1b4afc Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 3 Oct 2017 17:05:09 -0700 Subject: [PATCH 078/237] prev commit fixed seg fault issue on pull without push in a server --- tests/nightly/dist_sync_kvstore.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index e9e5fb43f001..5dc3072306b3 100644 --- 
a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -60,7 +60,7 @@ def init_kv_compressed(kv): #kv.set_optimizer(mx.optimizer.create('test')) # init kv compression keys kv.init('221', mx.nd.zeros(big_shape)) - kv.init('21', mx.nd.zeros(shape)) + # kv.init('21', mx.nd.zeros(shape)) #kv.set_optimizer(mx.optimizer.create('test')) return kv, pos_threshold, neg_threshold @@ -216,15 +216,15 @@ def check_zero(kv): kv.pull('221', val) check_diff_to_scalar(val, 0) - check_default_keys(kv, my_rank, nworker) - check_row_sparse_keys(kv, my_rank, nworker) - check_row_sparse_keys_with_zeros(kv, my_rank, nworker) - check_big_row_sparse_keys(kv, my_rank, nworker) - print('worker ' + str(my_rank) + ' is done with non compression tests') + # check_default_keys(kv, my_rank, nworker) + # check_row_sparse_keys(kv, my_rank, nworker) + # check_row_sparse_keys_with_zeros(kv, my_rank, nworker) + # check_big_row_sparse_keys(kv, my_rank, nworker) + # print('worker ' + str(my_rank) + ' is done with non compression tests') kv, pos, neg = init_kv_compressed(kv) # print ('pushing now') - # check_zero(kv) + check_zero(kv) # verify_residual(kv, pos, nworker) # check_ones(kv, pos, nworker) print('worker ' + str(my_rank) + ' is done with compression tests') From 4d5315a99b0da69549b79ac8d9f660c85adc4f80 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 3 Oct 2017 18:25:17 -0700 Subject: [PATCH 079/237] add waittowrite to fix pull before push problems --- src/kvstore/kvstore_dist.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index b97367f730f4..0259244a32ae 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -180,8 +180,9 @@ namespace kvstore { if (get_rank() == 0) { Push_(keys, values, 0, false); // wait until the push is finished - for (const auto& v : values) { - v.WaitToWrite(); + for (const int key : keys) { + comm_buf_[key].WaitToWrite(); + 
comm_small_send_buf_[key].WaitToWrite(); } } else { // do nothing @@ -321,6 +322,7 @@ namespace kvstore { } CopyFromTo(merged, &comm_buf); } + auto& small_buf = comm_small_buf_[key]; auto& res_buf = residual_[key]; if (compress_ != "none") { @@ -347,13 +349,14 @@ namespace kvstore { if (compress_ == "2bit") { // comm_buf.WaitToRead(); +// std::cout<<"waiting done"< foo(*reinterpret_cast((((float *) small_buf.data().dptr_)+3))); //std::cout<<"Compressed buf is "<<*((float *) small_buf.data().dptr_)<<" " From 13b2ce5ec3866f73221355f410e2cdb368976a15 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 5 Oct 2017 14:02:27 -0700 Subject: [PATCH 080/237] refactor quantizing for sharded data --- src/kvstore/kvstore_dist.h | 191 ++++++++++++++--------------- tests/nightly/dist_sync_kvstore.py | 24 ++-- 2 files changed, 110 insertions(+), 105 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 0259244a32ae..62fe0e396e77 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -182,7 +182,7 @@ namespace kvstore { // wait until the push is finished for (const int key : keys) { comm_buf_[key].WaitToWrite(); - comm_small_send_buf_[key].WaitToWrite(); + comm_small_buf_[key].WaitToWrite(); } } else { // do nothing @@ -326,16 +326,13 @@ namespace kvstore { auto& small_buf = comm_small_buf_[key]; auto& res_buf = residual_[key]; if (compress_ != "none") { + PSKV& pskv = EncodeCompressedKey(key, comm_buf.shape().Size(), true); // Init the small buffer and residual_ buffer for quantize if (small_buf.is_none()) { - int bits = compress_ == "2bit" ? 16 : 32; - long int small_size = merged.shape().Size() % bits == 0 ? 
- merged.shape().Size() / bits + 3 : - merged.shape().Size() / bits + 4; // small buffer for quantize - small_buf = NDArray(TShape{small_size}, comm_buf.ctx(), false, comm_buf.dtype()); + small_buf = NDArray(TShape{pskv.size}, comm_buf.ctx(), false, comm_buf.dtype()); // residual buffer for quantize - res_buf = NDArray(merged.shape(), comm_buf.ctx(), false, comm_buf.dtype()); + res_buf = NDArray(TShape{(long int) comm_buf.shape().Size()}, comm_buf.ctx(), false, comm_buf.dtype()); res_buf = 0; if (pos_thre_.is_none()) { // positive threshold @@ -348,6 +345,7 @@ namespace kvstore { } if (compress_ == "2bit") { + Compress(comm_buf, &small_buf, &res_buf, pskv, priority); // comm_buf.WaitToRead(); // std::cout<<"waiting done"< foo(*reinterpret_cast((((float *) small_buf.data().dptr_)+3))); @@ -383,76 +381,33 @@ namespace kvstore { } } - void PushCompressed(int key, NDArray& comm_buf, NDArray &send_buf, int priority){ -// for (int i=3; i const_vars; - const_vars.push_back(comm_buf.var()); - if (comm_buf.shape().Size() > bigarray_bound_) { - if (comm_small_send_buf.is_none()) { - comm_small_send_buf = NDArray(TShape{pskv.size}, comm_buf.ctx(), false, comm_buf.dtype()); - } - size_t prev_from = 3; - size_t prev_to = 0; - int cursize = 0; - int original_size = comm_buf.shape().Size(); - CHECK_GT(original_size,0); - NDArray meta = send_buf.Slice(0,3); - for(size_t i = 0; i const_vars; +// const_vars.push_back(comm_buf.var()); +// const_vars.push_back(comm_small_send_buf.var()); +// } else { +// if compress is set, then send_buf is different from comm_buf +// const_vars.push_back(send_buf.var()); +// } auto push_to_servers = - [this, key, pskv, comm_buf, &comm_small_send_buf, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { + [this, key, comm_buf, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys size_t size = 0; real_t* data = nullptr; - if ( comm_buf.shape().Size() > bigarray_bound_ ){ - data = comm_small_send_buf.data().dptr(); - 
#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(comm_small_send_buf.data()); - #endif - size = comm_small_send_buf.shape().Size(); - } else { - data = send_buf.data().dptr(); + PSKV& pskv = EncodeCompressedKey(key, comm_buf.shape().Size(), true); + data = send_buf.data().dptr(); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(send_buf.data()); #endif size = send_buf.shape().Size(); - } // do push. false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( @@ -461,7 +416,7 @@ namespace kvstore { Engine::Get()->PushAsync( push_to_servers, pinned_ctx_, - const_vars, + {send_buf.var(), comm_buf.var()}, {}, FnProperty::kNormal, priority, @@ -609,7 +564,36 @@ namespace kvstore { } - PSKV& EncodeKey(int key, size_t size, bool is_push, bool is_compressed) { + void Compress(NDArray& comm_buf, NDArray* small_buf, NDArray* res_buf, PSKV& pskv, int priority){ + size_t orig_size = comm_buf.shape().Size(); + NDArray flattened_comm_buf = comm_buf.Reshape(TShape{(long int) orig_size}); + + //should be start of data in original commbuf + size_t cur_from = 0; + //should be start of meta in new small_buf +// std::cout<<"max_from"<Slice(cur_to, cur_to+pskv.lens[i]); + + size_t end_part_data = cur_from + (pskv.lens[i] -3 )* 16; + if (end_part_data > orig_size) { + end_part_data = orig_size; + } + NDArray fromdata = flattened_comm_buf.Slice(cur_from, end_part_data); + NDArray respart = res_buf->Slice(cur_from, end_part_data); + Quantize(fromdata, &part_compr, &respart, pos_thre_, neg_thre_, compress_, priority); + cur_from = end_part_data; + cur_to = cur_to + pskv.lens[i]; + } + CHECK_EQ(cur_to, small_buf->shape().Size()); + } + + PSKV& EncodeKey(int key, size_t size, bool is_push, bool is_compressed) { if (is_compressed) { return EncodeCompressedKey(key, size, is_push); } else { @@ -624,21 +608,36 @@ namespace kvstore { * send_buf in this case will add few counts of meta information * to each part if divided */ - inline PSKV& 
EncodeCompressedKey(int key, size_t buf_size, bool is_push) { - size_t original_size = comm_buf_[key].shape().Size(); - size_t size = (is_push) ? buf_size : original_size; - mu_.lock(); - PSKV& pskv = (is_push) ? push_ps_kv_[key] : pull_ps_kv_[key]; - mu_.unlock(); + inline PSKV& EncodeCompressedKey(int key, size_t original_size, bool is_push) { auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); int num_servers = krs.size(); CHECK_GT(num_servers, 0); - if (!pskv.keys.empty()) { - if (is_push) { - CHECK_EQ(static_cast(pskv.size), buf_size+3*(num_servers-1))<< "The value size can't be changed"; + + int bits = compress_ == "2bit" ? 16 : 32; + size_t size = 0; + size_t size_data = original_size % bits == 0 ? + original_size / bits : + original_size / bits + 1; + if (is_push) { + if (original_size >= bigarray_bound_) { + size = original_size % bits == 0 ? + original_size / bits + (3*num_servers): + original_size / bits + 1 + (3*num_servers); } else { - CHECK_EQ(static_cast(pskv.size), original_size)<< "The value size can't be changed"; + size = original_size % bits == 0 ? + original_size / bits + 3: + original_size / bits + 4; } + } else { + size = original_size; + } + + mu_.lock(); + PSKV& pskv = (is_push) ? push_ps_kv_[key] : pull_ps_kv_[key]; + mu_.unlock(); + + if (!pskv.keys.empty()) { + CHECK_EQ(static_cast(pskv.size), size)<< "The value size can't be changed"; } else { // a simple heuristic for load balance if (original_size < bigarray_bound_) { @@ -652,31 +651,32 @@ namespace kvstore { } else { // partition it to all servers pskv.size = 0; - size_t final_size = buf_size+3*(num_servers-1); for (int i = 0; i < num_servers; ++i) { //if pushing, divide size of compressed array into blocks of 16, so we don't split between a compressed value - size_t part_size = is_push? 
(roundUp((size-3)/num_servers*(i+1), 16) - roundUp((size-3)/num_servers*(i), 16) + 3) - : (roundUp((size)/num_servers*(i+1), 1) - roundUp((size)/num_servers*(i), 1)); + size_t part_size = roundUp(size_data/num_servers*(i+1), 16) - roundUp(size_data/num_servers*(i), 16) + 3; +// : (roundUp((size)/num_servers*(i+1), 1) - roundUp((size)/num_servers*(i), 1)); ps::Key ps_key = krs[i].begin() + key; CHECK_LT(ps_key, krs[i].end()); pskv.keys.push_back(ps_key); //if last block was rounded up to beyond size of our data, set it to end of data - if (i == num_servers-1 && ((pskv.size+part_size) > final_size)) { - part_size = buf_size + 3*(num_servers-1) - pskv.size; + if (i == num_servers-1 && ((pskv.size+part_size) > size)) { + part_size = size - pskv.size; } - pskv.lens.push_back(part_size); - pskv.size += part_size; - } - if (is_push) { - CHECK_EQ(static_cast(pskv.size), final_size); - } else { - for (int i=0; i < num_servers; ++i) { - pskv.lens[i] = (pskv.lens[i]-3)*16; + if(is_push) { + pskv.lens.push_back(part_size); + pskv.size += part_size; + } else { + pskv.lens.push_back((part_size-3)*16); + pskv.size += (part_size-3)*16; } - CHECK_EQ(static_cast(pskv.size), original_size); + std::cout<<"pskv len "<(pskv.size), size); + CHECK_EQ(pskv.lens.size(), num_servers); } + } return pskv; } @@ -796,7 +796,6 @@ namespace kvstore { std::unordered_map comm_buf_; /// \brief small buffer for quantize std::unordered_map comm_small_buf_; - std::unordered_map comm_small_send_buf_; /// \brief residual buffer for quantize std::unordered_map residual_; /// \brief threshold for quantize diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 5dc3072306b3..7374e16a1f81 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -60,7 +60,7 @@ def init_kv_compressed(kv): #kv.set_optimizer(mx.optimizer.create('test')) # init kv compression keys kv.init('221', mx.nd.zeros(big_shape)) - # kv.init('21', mx.nd.zeros(shape)) + 
kv.init('21', mx.nd.zeros(shape)) #kv.set_optimizer(mx.optimizer.create('test')) return kv, pos_threshold, neg_threshold @@ -209,6 +209,11 @@ def check_ones(kv, pos, nworker): newval = curval + 2*nworker*pos check_diff_to_scalar(val2, newval) + def check_pull_before_push(kv): + val = mx.nd.ones(big_shape) + kv.pull('221', val) + check_diff_to_scalar(val, 0) + def check_zero(kv): kv.push('221', mx.nd.zeros(big_shape)) # to check that all are set to 0s @@ -216,17 +221,18 @@ def check_zero(kv): kv.pull('221', val) check_diff_to_scalar(val, 0) - # check_default_keys(kv, my_rank, nworker) - # check_row_sparse_keys(kv, my_rank, nworker) - # check_row_sparse_keys_with_zeros(kv, my_rank, nworker) - # check_big_row_sparse_keys(kv, my_rank, nworker) - # print('worker ' + str(my_rank) + ' is done with non compression tests') + print ('worker '+str(my_rank)+' started') + check_default_keys(kv, my_rank, nworker) + check_row_sparse_keys(kv, my_rank, nworker) + check_row_sparse_keys_with_zeros(kv, my_rank, nworker) + check_big_row_sparse_keys(kv, my_rank, nworker) + print('worker ' + str(my_rank) + ' is done with non compression tests') kv, pos, neg = init_kv_compressed(kv) - # print ('pushing now') + check_pull_before_push(kv) check_zero(kv) - # verify_residual(kv, pos, nworker) - # check_ones(kv, pos, nworker) + verify_residual(kv, pos, nworker) + check_ones(kv, pos, nworker) print('worker ' + str(my_rank) + ' is done with compression tests') if __name__ == "__main__": From 648e0e9f01c1a08a197a067a0de84ba65490d16c Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 5 Oct 2017 17:57:15 -0700 Subject: [PATCH 081/237] redo break up of data across servers,clearer split --- src/kvstore/kvstore_dist.h | 83 ++++++++++++++++-------------- tests/nightly/dist_sync_kvstore.py | 22 ++++---- 2 files changed, 55 insertions(+), 50 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 62fe0e396e77..72ac4e68d06e 100644 --- a/src/kvstore/kvstore_dist.h +++ 
b/src/kvstore/kvstore_dist.h @@ -557,12 +557,6 @@ namespace kvstore { */ std::mutex mu_; - size_t roundUp(size_t numToRound, size_t multiple) - { - assert(multiple && ((multiple & (multiple - 1)) == 0)); - return (numToRound + multiple - 1) & -multiple; - } - void Compress(NDArray& comm_buf, NDArray* small_buf, NDArray* res_buf, PSKV& pskv, int priority){ size_t orig_size = comm_buf.shape().Size(); @@ -612,21 +606,17 @@ namespace kvstore { auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); int num_servers = krs.size(); CHECK_GT(num_servers, 0); - int bits = compress_ == "2bit" ? 16 : 32; + //size of data to be sent size_t size = 0; - size_t size_data = original_size % bits == 0 ? - original_size / bits : - original_size / bits + 1; if (is_push) { if (original_size >= bigarray_bound_) { - size = original_size % bits == 0 ? - original_size / bits + (3*num_servers): - original_size / bits + 1 + (3*num_servers); + size = (size_t) num_servers * ((original_size / num_servers) % bits == 0 ? + (original_size/num_servers) / bits + 3 : + (original_size/num_servers) / bits + 4); } else { size = original_size % bits == 0 ? 
- original_size / bits + 3: - original_size / bits + 4; + original_size / bits + 3: original_size / bits + 4; } } else { size = original_size; @@ -639,45 +629,58 @@ namespace kvstore { if (!pskv.keys.empty()) { CHECK_EQ(static_cast(pskv.size), size)<< "The value size can't be changed"; } else { - // a simple heuristic for load balance + // populate both pull and push pskvs + mu_.lock(); + PSKV& pull_pskv = pull_ps_kv_[key]; + PSKV& push_pskv = push_ps_kv_[key]; + mu_.unlock(); + if (original_size < bigarray_bound_) { + // a simple heuristic for load balancing // send it to a single random picked server int server = (key * 9973) % num_servers; ps::Key ps_key = krs[server].begin() + key; CHECK_LT(ps_key, krs[server].end()); - pskv.keys.push_back(ps_key); - pskv.lens.push_back(size); - pskv.size = size; + push_pskv.keys.push_back(ps_key); + pull_pskv.keys.push_back(ps_key); + push_pskv.lens.push_back(size); + pull_pskv.lens.push_back(original_size); + push_pskv.size = size; + pull_pskv.size = original_size; } else { // partition it to all servers - pskv.size = 0; + push_pskv.size = 0; + pull_pskv.size = 0; for (int i = 0; i < num_servers; ++i) { - //if pushing, divide size of compressed array into blocks of 16, so we don't split between a compressed value - size_t part_size = roundUp(size_data/num_servers*(i+1), 16) - roundUp(size_data/num_servers*(i), 16) + 3; -// : (roundUp((size)/num_servers*(i+1), 1) - roundUp((size)/num_servers*(i), 1)); - ps::Key ps_key = krs[i].begin() + key; - CHECK_LT(ps_key, krs[i].end()); - pskv.keys.push_back(ps_key); - - //if last block was rounded up to beyond size of our data, set it to end of data - if (i == num_servers-1 && ((pskv.size+part_size) > size)) { - part_size = size - pskv.size; - } - if(is_push) { - pskv.lens.push_back(part_size); - pskv.size += part_size; - } else { - pskv.lens.push_back((part_size-3)*16); - pskv.size += (part_size-3)*16; + size_t part_orig = static_cast ( + lround(static_cast(original_size) / num_servers 
* (i + 1)) - + lround(static_cast(original_size) / num_servers * i)); +// if(get_rank()==0) std::cout<<"part_orig "< original_size) { + part_orig = original_size - pskv.size; } - std::cout<<"pskv len "<(pskv.size), size); + CHECK_EQ(static_cast(pull_pskv.size), original_size); CHECK_EQ(pskv.lens.size(), num_servers); +// std::cout<<"set pull pskv for key:"< Date: Sun, 8 Oct 2017 18:20:13 -0700 Subject: [PATCH 082/237] refactor to use param for thresholds. also cleans up code --- include/mxnet/ndarray.h | 2 +- src/kvstore/comm.h | 81 ++---- src/kvstore/kvstore_dist.h | 264 +++++++++----------- src/kvstore/kvstore_dist_server.h | 26 +- src/ndarray/ndarray.cc | 15 +- src/ndarray/ndarray_function.cc | 5 +- src/ndarray/ndarray_function.h | 3 +- src/operator/contrib/two_bit_quantize-inl.h | 101 ++++---- src/operator/contrib/two_bit_quantize.cc | 22 +- 9 files changed, 214 insertions(+), 305 deletions(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 692f56d2ca19..0a9c3fcdbef8 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -906,7 +906,7 @@ size_t num_aux_data(NDArrayStorageType stype); void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0); void Quantize(const NDArray &from, NDArray *to, NDArray *residual, - const NDArray &pos_threshold, const NDArray &neg_threshold, + const float neg_threshold, const float pos_threshold, std::string& compress, int priority); void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int priority); diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 5cb89629183c..ff1c31609905 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -83,7 +83,7 @@ class Comm { /** * \brief set to use low-bit compression */ - void SetCompress(const std::string& compress, + inline void SetCompress(const std::string& compress, float const pos_threshold, float const neg_threshold) { compress_ = compress; @@ -519,38 +519,31 @@ class CommDevice : public Comm { // NDArray.Slice or 
gpu direct memory access. for the latter, we need to // remove some ctx check, and also it reduces 20% perf buf.copy_buf.resize(src.size()); - buf.small_recv_buf.resize(src.size()); - buf.small_send_buf.resize(src.size()); - buf.residual.resize(src.size()); - pos_thre.resize(src.size()); - neg_thre.resize(src.size()); + + if (compress_!="none") { + // one buf for each context + buf.small_recv_buf.resize(src.size()); + buf.small_send_buf.resize(src.size()); + buf.residual.resize(src.size()); + } for (size_t i = 0; i < src.size(); ++i) { buf.copy_buf[i] = NDArray( buf.merged.shape(), buf.merged.ctx(), false, buf.merged.dtype()); - // allocation small buffer for compressed data if (compress_.compare("none") != 0) { - // Residual buf.residual[i] = NDArray( buf.merged.shape(), src[i].ctx(), false, buf.merged.dtype()); buf.residual[i] = 0; - // recv buffer and send buffer - int bits = compress_ == "2bit" ? 16 : 32; - long int small_size = buf.merged.shape().Size() % bits == 0 ? - buf.merged.shape().Size() / bits + 3 : - buf.merged.shape().Size() / bits + 4; - buf.small_recv_buf[i] = NDArray( - TShape{small_size}, buf.merged.ctx(), false, buf.merged.dtype()); - buf.small_send_buf[i] = NDArray( - TShape{small_size}, src[i].ctx(), false, buf.merged.dtype()); - // The positive and negative threshold + int bits; if (compress_.compare("2bit") == 0) { - pos_thre[i] = NDArray( - TShape{1}, src[i].ctx(), false, buf.merged.dtype()); - pos_thre[i] = pos_threshold_; - neg_thre[i] = NDArray( - TShape{1}, src[i].ctx(), false, buf.merged.dtype()); - neg_thre[i] = neg_threshold_; + bits = 16; + long int small_size = buf.merged.shape().Size() % bits == 0 ? 
+ buf.merged.shape().Size() / bits + 3 : + buf.merged.shape().Size() / bits + 4; + buf.small_recv_buf[i] = NDArray( + TShape{small_size}, buf.merged.ctx(), false, buf.merged.dtype()); + buf.small_send_buf[i] = NDArray( + TShape{small_size}, src[i].ctx(), false, buf.merged.dtype()); } } } @@ -558,44 +551,19 @@ class CommDevice : public Comm { for (size_t i = 0; i < src.size(); ++i) { // compress before copy - if (compress_.compare("2bit") == 0) { - // TODO: New code: wrapper for NDArray quantize_2bit op - /* - Compress2Bit(src[i], buf.residual[i], - pos_thre[i], neg_thre[i], - &(buf.small_send_buf[i]), priority); - CopyFromTo(buf.small_send_buf[i], - &(buf.small_recv_buf[i]), - priority); - DeCompress2Bit(buf.small_recv_buf[i], - &(buf.copy_buf[i]), - priority); - */ + if (compress_=="none") { + CopyFromTo(src[i], &(buf.copy_buf[i]), priority); + } else if (compress_ == "2bit") { Quantize(src[i], &(buf.small_send_buf[i]), &(buf.residual[i]), - pos_thre[i], neg_thre[i], compress_, priority); + neg_threshold_, pos_threshold_, compress_, priority); CopyFromTo(buf.small_send_buf[i], &(buf.small_recv_buf[i]), priority); Dequantize(buf.small_recv_buf[i], &(buf.copy_buf[i]), compress_, priority); - } else if (compress_.compare("1bit") == 0) { - // TODO: New code: wrapper for NDArray quantize_1bit op - /* - Compress1Bit(src[i], buf.residual[i], - &(buf.small_send_buf[i]), - priority); - CopyFromTo(buf.small_send_buf[i], - &(buf.small_recv_buf[i]), - priority); - DeCompress1Bit(buf.small_recv_buf[i], - &(buf.copy_buf[i]), - priority); - */ - } else { // Do not compress - CopyFromTo(src[i], &(buf.copy_buf[i]), priority); + } else { + LOG(FATAL) << "Unsupported compression "< merge_buf_; - // \brief the positive and negative threshold - std::vector pos_thre; - std::vector neg_thre; bool inited_; }; diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 72ac4e68d06e..635e2f13d718 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ 
-182,7 +182,7 @@ namespace kvstore { // wait until the push is finished for (const int key : keys) { comm_buf_[key].WaitToWrite(); - comm_small_buf_[key].WaitToWrite(); + compr_buf_[key].WaitToWrite(); } } else { // do nothing @@ -217,12 +217,11 @@ namespace kvstore { recv_buf = NDArray(grouped_vals[i][0]->shape(), pinned_ctx_, true, grouped_vals[i][0]->dtype()); } - bool is_compressed = (compress_!="none"); - auto pull_from_servers = [this, key, recv_buf, is_compressed]( + auto pull_from_servers = [this, key, recv_buf]( RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys size_t size = recv_buf.shape().Size(); - PSKV& pskv = EncodeKey(key, size, false, is_compressed); + PSKV& pskv = EncodeKey(key, size, false); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(recv_buf.data()); #endif @@ -303,11 +302,11 @@ namespace kvstore { for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devices int key = uniq_keys[i]; - const auto& vals = grouped_vals[i]; + const auto &vals = grouped_vals[i]; NDArray merged = do_merge ? 
comm_->Reduce(key, vals, priority) : vals[0]; const auto storage_type = merged.storage_type(); - auto& comm_buf = comm_buf_[key]; + auto &comm_buf = comm_buf_[key]; if (merged.ctx().dev_mask() == cpu::kDevMask) { // make sure the previous push/pull is completed comm_buf.WaitToWrite(); @@ -323,106 +322,44 @@ namespace kvstore { CopyFromTo(merged, &comm_buf); } - auto& small_buf = comm_small_buf_[key]; - auto& res_buf = residual_[key]; if (compress_ != "none") { - PSKV& pskv = EncodeCompressedKey(key, comm_buf.shape().Size(), true); + auto &small_buf = compr_buf_[key]; + auto &res_buf = residual_[key]; + size_t original_size = comm_buf.shape().Size(); + PSKV &pskv = EncodeCompressedKey(key, original_size, true); // Init the small buffer and residual_ buffer for quantize if (small_buf.is_none()) { // small buffer for quantize small_buf = NDArray(TShape{pskv.size}, comm_buf.ctx(), false, comm_buf.dtype()); // residual buffer for quantize - res_buf = NDArray(TShape{(long int) comm_buf.shape().Size()}, comm_buf.ctx(), false, comm_buf.dtype()); + res_buf = NDArray(TShape{(long int) original_size}, comm_buf.ctx(), false, comm_buf.dtype()); res_buf = 0; - if (pos_thre_.is_none()) { - // positive threshold - pos_thre_ = NDArray(TShape{1}, comm_buf.ctx(), false, mshadow::kFloat32); - pos_thre_ = pos_threshold_; - // negative threshold - neg_thre_ = NDArray(TShape{1}, comm_buf.ctx(), false, mshadow::kFloat32); - neg_thre_ = neg_threshold_; - } } if (compress_ == "2bit") { Compress(comm_buf, &small_buf, &res_buf, pskv, priority); -// comm_buf.WaitToRead(); -// std::cout<<"waiting done"< foo(*reinterpret_cast((((float *) small_buf.data().dptr_)+3))); - //std::cout<<"Compressed buf is "<<*((float *) small_buf.data().dptr_)<<" " - // << *(((float *) small_buf.data().dptr_)+1) << " " - // << *(((float *) small_buf.data().dptr_)+2) << " " - // << foo << " " << *(((float *) small_buf.data().dptr_)+3) << std::endl; - //std::cout<<"Res buf is "<< *((float *) res_buf.data().dptr_) < 
const_vars; -// const_vars.push_back(comm_buf.var()); -// const_vars.push_back(comm_small_send_buf.var()); -// } else { -// if compress is set, then send_buf is different from comm_buf -// const_vars.push_back(send_buf.var()); -// } - auto push_to_servers = - [this, key, comm_buf, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { - // convert to ps keys - size_t size = 0; - real_t* data = nullptr; - PSKV& pskv = EncodeCompressedKey(key, comm_buf.shape().Size(), true); - data = send_buf.data().dptr(); - #if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(send_buf.data()); - #endif - size = send_buf.shape().Size(); - // do push. false means no delete - ps::SArray vals(data, size, false); - CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, kDefaultPushPull, [cb]() { cb(); }); - }; - Engine::Get()->PushAsync( - push_to_servers, - pinned_ctx_, - {send_buf.var(), comm_buf.var()}, - {}, - FnProperty::kNormal, - priority, - PROFILER_MESSAGE("KVStoreDistCompressedPush")); - } - void PushDefault(int key, NDArray &send_buf, int priority){ auto push_to_servers = [this, key, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { @@ -434,7 +371,7 @@ namespace kvstore { #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(send_buf.data()); #endif - PSKV& pskv = EncodeKey(key, size, true); + PSKV& pskv = EncodeDefaultKey(key, size, true); // do push. 
false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( @@ -553,80 +490,114 @@ namespace kvstore { std::unordered_map push_ps_kv_; std::unordered_map pull_ps_kv_; /** - * \brief serizelize EncodeRowSparseKey and EncodeKey + * \brief serialize EncodeRowSparseKey and EncodeKey */ std::mutex mu_; + void PushCompressed(int key, NDArray& comm_buf, NDArray &small_buf, PSKV& pskv, int priority){ + auto push_to_servers = + [this, key, comm_buf, pskv, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) { + // convert to ps keys + size_t size = 0; + real_t* data = nullptr; + size = small_buf.shape().Size(); + data = small_buf.data().dptr(); + #if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(small_buf.data()); + #endif + // do push. false means no delete + ps::SArray vals(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPush( + pskv.keys, vals, pskv.lens, kDefaultPushPull, [cb]() { cb(); }); + }; + Engine::Get()->PushAsync( + push_to_servers, + pinned_ctx_, + {small_buf.var(), comm_buf.var()}, + {}, + FnProperty::kNormal, + priority, + PROFILER_MESSAGE("KVStoreDistCompressedPush")); + } - void Compress(NDArray& comm_buf, NDArray* small_buf, NDArray* res_buf, PSKV& pskv, int priority){ - size_t orig_size = comm_buf.shape().Size(); - NDArray flattened_comm_buf = comm_buf.Reshape(TShape{(long int) orig_size}); - - //should be start of data in original commbuf - size_t cur_from = 0; - //should be start of meta in new small_buf -// std::cout<<"max_from"<Slice(cur_to, cur_to+pskv.lens[i]); - - size_t end_part_data = cur_from + (pskv.lens[i] -3 )* 16; - if (end_part_data > orig_size) { - end_part_data = orig_size; - } - NDArray fromdata = flattened_comm_buf.Slice(cur_from, end_part_data); - NDArray respart = res_buf->Slice(cur_from, end_part_data); - Quantize(fromdata, &part_compr, &respart, pos_thre_, neg_thre_, compress_, priority); - cur_from = end_part_data; - cur_to = cur_to + pskv.lens[i]; + + void Compress(NDArray& comm_buf, 
NDArray* small_buf, NDArray* res_buf, PSKV& pskv, int priority){ + size_t orig_size = comm_buf.shape().Size(); + NDArray flattened_comm_buf = comm_buf.Reshape(TShape{(long int) orig_size}); + int bits; + if (compress_ == "2bit") { + bits = 16; + } else { + LOG(FATAL) << "Unsupported compression type"; + } + //should be start of data in original commbuf + size_t cur_from = 0; + //should be start of meta in new small_buf + size_t cur_to = 0; + for(int i=0; iSlice(cur_to, cur_to+pskv.lens[i]); + // removing the 3 values from pskv length which are meta data + size_t end_part_data = cur_from + (pskv.lens[i] - 3 )* bits; + // don't exceed origin_size + if (end_part_data > orig_size) { + end_part_data = orig_size; } - CHECK_EQ(cur_to, small_buf->shape().Size()); + NDArray part_data = flattened_comm_buf.Slice(cur_from, end_part_data); + NDArray part_res = res_buf->Slice(cur_from, end_part_data); + Quantize(part_data, &part_compr, &part_res, neg_threshold_, pos_threshold_, compress_, priority); + cur_from = end_part_data; + cur_to = cur_to + pskv.lens[i]; } + CHECK_EQ(cur_from, orig_size); + CHECK_EQ(cur_to, small_buf->shape().Size()); + } - PSKV& EncodeKey(int key, size_t size, bool is_push, bool is_compressed) { - if (is_compressed) { + PSKV& EncodeKey(int key, size_t size, bool is_push) { + if (compress_!="none") { return EncodeCompressedKey(key, size, is_push); } else { - return EncodeKey(key, size, is_push); + return EncodeDefaultKey(key, size, is_push); } } /** - * \brief convert to keys in ps for compressed values - * \brief buf_size will be size of recv_buf (original size) if pull - * buf_size will be size of quantized array if push. 
Actual size of - * send_buf in this case will add few counts of meta information - * to each part if divided + * \brief Convert to keys in ps for compressed values + * \brief Divides original array into equal parts for each server + * with space for meta info */ inline PSKV& EncodeCompressedKey(int key, size_t original_size, bool is_push) { auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); int num_servers = krs.size(); CHECK_GT(num_servers, 0); - int bits = compress_ == "2bit" ? 16 : 32; - //size of data to be sent - size_t size = 0; - if (is_push) { - if (original_size >= bigarray_bound_) { - size = (size_t) num_servers * ((original_size / num_servers) % bits == 0 ? - (original_size/num_servers) / bits + 3 : - (original_size/num_servers) / bits + 4); - } else { - size = original_size % bits == 0 ? - original_size / bits + 3: original_size / bits + 4; - } + int bits; + if (compress_ == "2bit") { + bits = 16; + } else { + LOG(FATAL)<<"Unsupported compression type"; + } + // represents size of data to be sent +// size_t size = 0; + size_t compr_size = 0; + // add 3 values as meta info + if (original_size >= bigarray_bound_) { + compr_size = num_servers * ((original_size/num_servers) % bits == 0 ? + (original_size/num_servers)/bits + 3 : + (original_size/num_servers)/bits + 4); } else { - size = original_size; + compr_size = original_size % bits == 0 ? + original_size / bits + 3: original_size / bits + 4; } +// size = original_size; + mu_.lock(); PSKV& pskv = (is_push) ? push_ps_kv_[key] : pull_ps_kv_[key]; mu_.unlock(); if (!pskv.keys.empty()) { + size_t size = (is_push) ? 
compr_size : original_size; CHECK_EQ(static_cast(pskv.size), size)<< "The value size can't be changed"; } else { // populate both pull and push pskvs @@ -643,19 +614,18 @@ namespace kvstore { CHECK_LT(ps_key, krs[server].end()); push_pskv.keys.push_back(ps_key); pull_pskv.keys.push_back(ps_key); - push_pskv.lens.push_back(size); + push_pskv.lens.push_back(compr_size); pull_pskv.lens.push_back(original_size); - push_pskv.size = size; + push_pskv.size = compr_size; pull_pskv.size = original_size; } else { // partition it to all servers push_pskv.size = 0; pull_pskv.size = 0; for (int i = 0; i < num_servers; ++i) { - size_t part_orig = static_cast ( - lround(static_cast(original_size) / num_servers * (i + 1)) - - lround(static_cast(original_size) / num_servers * i)); -// if(get_rank()==0) std::cout<<"part_orig "< (round(static_cast(original_size)/num_servers*(i+1))) - + static_cast (round(static_cast(original_size)/num_servers*(i))); // if block was rounded up to beyond size of our data, set it to end of data if (part_orig + pskv.size > original_size) { part_orig = original_size - pskv.size; @@ -672,13 +642,10 @@ namespace kvstore { pull_pskv.lens.push_back(part_orig); push_pskv.size += compr_split; pull_pskv.size += part_orig; -// if (get_rank()==0) std::cout << "push key: " << key << " pskv len " << pskv.lens[i] << " " << std::endl; -// if (get_rank()==0) std::cout << "pull key: " << key << " pskv len " << pull_pskv.lens[i] << " " << std::endl; } - CHECK_EQ(static_cast(pskv.size), size); + CHECK_EQ(static_cast(push_pskv.size), compr_size); CHECK_EQ(static_cast(pull_pskv.size), original_size); - CHECK_EQ(pskv.lens.size(), num_servers); -// std::cout<<"set pull pskv for key:"< comm_buf_; /// \brief small buffer for quantize - std::unordered_map comm_small_buf_; + std::unordered_map compr_buf_; /// \brief residual buffer for quantize std::unordered_map residual_; - /// \brief threshold for quantize - NDArray pos_thre_; - NDArray neg_thre_; bool log_verbose_; }; diff 
--git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 3d65bb1e78c1..fa1ea2815337 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -387,8 +387,6 @@ class KVStoreDistServer { NDArray recved = NDArray(recv_blob, 0); NDArray decomp_buf = decomp_buf_[key]; if (compress_ != "none") { -// std::cout<<"threshold value is "<<(*(recv_blob.dptr()+0)) -// <<" and "<<(*(recv_blob.dptr()+1))<<" "<<(*(recv_blob.dptr()+2))<< " "<<(*(recv_blob.dptr()+3))<()+2)); dshape = TShape{original_size}; if (decomp_buf.is_none()) { @@ -416,29 +414,15 @@ class KVStoreDistServer { if (compress_ == "none") { CopyFromTo(recved, &merged.array, 0); } else { -// std::cout<<"recvd threshold"<<* (float*) recved.data().dptr_<Response(req_meta); stored.WaitToRead(); } -// std::cout<<"Server: Finished push"< response; @@ -502,7 +484,7 @@ class KVStoreDistServer { /** * \brief decomp_buf_ is a buffer into which compressed values are - * decompressed before merging to the store + * decompressed before merging to the store. 
used when compress_!='none' */ std::unordered_map decomp_buf_; diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 99bd4a8d98e6..82b8700b44ab 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -547,7 +547,7 @@ void CopyFromTo(const NDArray &from, NDArray *to, int priority) { } void Quantize(const NDArray &from, NDArray *to, NDArray *residual, - const NDArray &pos_threshold, const NDArray &neg_threshold, + const float neg_threshold, const float pos_threshold, std::string& compress, int priority) { CHECK(from.shape().ndim() != 0) << "source operands have zero dimension shape"; @@ -558,23 +558,20 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, int b = to->ctx().dev_mask(); std::vector const_vars; const_vars.push_back(from.var()); - const_vars.push_back(pos_threshold.var()); - const_vars.push_back(neg_threshold.var()); std::vector mutable_vars; mutable_vars.push_back(ret.var()); mutable_vars.push_back(res.var()); - std::vector inputs(5); + std::vector inputs(3); inputs[0] = from.data(); inputs[1] = residual->data(); - inputs[2] = neg_threshold.data(); - inputs[3] = pos_threshold.data(); - inputs[4] = to->data(); + inputs[2] = to->data(); if (a == cpu::kDevMask && b == cpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([inputs](RunContext ctx) { - mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs); + Engine::Get()->PushSync([inputs, neg_threshold, pos_threshold](RunContext ctx) { + mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, + neg_threshold, pos_threshold); }, from.ctx(), const_vars, mutable_vars, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index 5028f82d1776..4ad7a3415ddc 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -190,8 +190,9 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i } template<> -void 
Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { - mxnet::op::Quantize2BitImpl(s,inputs); +void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, + const float neg_threshold, const float pos_threshold) { + mxnet::op::Quantize2BitImpl(s,inputs, neg_threshold, pos_threshold); } } // namespace ndarray diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index 10c2ffb14468..cef2886c7e82 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -168,7 +168,8 @@ template void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs); template -void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs); +void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, + const float neg_threshold, const float pos_threshold); template void ElementwiseSum(const std::vector source, diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index d98d0c040571..d199edb5ae15 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -41,6 +41,20 @@ struct init_mem_2bit { } }; +struct TwoBitParam : public dmlc::Parameter { + float pos_threshold, neg_threshold; + DMLC_DECLARE_PARAMETER(TwoBitParam) { + DMLC_DECLARE_FIELD(pos_threshold) + .set_default(0.1) + .describe("Threshold to quantize positive values. " + "Has to be greater than 0"); + DMLC_DECLARE_FIELD(neg_threshold) + .set_default(-0.1) + .describe("Threshold to quantize negative values. 
" + "Has to be less than 0"); + } +}; + template void Create2BitArrayCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -48,11 +62,9 @@ void Create2BitArrayCompute(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { // For now, this method can only compress the float data - using namespace mshadow; - using namespace mxnet_op; - Stream *s = ctx.get_stream(); + mshadow::Stream *s = ctx.get_stream(); // Init the memory of output to 0x00000000 - Kernel::Launch(s, outputs[0].Size(), + mxnet_op::Kernel::Launch(s, outputs[0].Size(), outputs[0].dptr()); // compressed array } @@ -73,9 +85,9 @@ inline bool Create2BitArrayShape(const nnvm::NodeAttrs& attrs, return true; } -inline bool Create2BitArray2BitType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { +inline bool Create2BitArrayType(const nnvm::NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { // 0. input array CHECK_EQ(in_attrs->size(), 1U); // 0. 
output array @@ -90,13 +102,13 @@ inline bool Create2BitArray2BitType(const nnvm::NodeAttrs& attrs, struct init_threshold_2bit { MSHADOW_XINLINE static void Map(int i, float *out, - const float *neg_threshold, - const float *pos_threshold, + const float neg_threshold, + const float pos_threshold, int size) { - // The first two elments in output is threshold + // The first two elements in output are thresholds // The third element is the original size of the array - out[0] = *neg_threshold; - out[1] = *pos_threshold; + out[0] = neg_threshold; + out[1] = pos_threshold; out[2] = (float)size; } }; @@ -106,8 +118,8 @@ struct quantize_2bit { float *out, float *grad, float *residual, - const float *neg_threshold, - const float *pos_threshold) { + const float neg_threshold, + const float pos_threshold) { // Add residual to gradient grad[i] += residual[i]; // get block id @@ -119,9 +131,9 @@ struct quantize_2bit { // get column id int col_id = (i%16)%4; // Compress - if (grad[i] <= *neg_threshold) { // set data to 01 + if (grad[i] <= neg_threshold) { // set data to 01 // new residual - residual[i] = grad[i] - *neg_threshold; + residual[i] = grad[i] - neg_threshold; switch (col_id) { case 0: (*ch_ptr) |= 0x40; // binary: (01)00 0000 @@ -138,8 +150,8 @@ struct quantize_2bit { default: break; } - } else if (grad[i] >= *pos_threshold) { // set data to 10 - residual[i] = grad[i] - *pos_threshold; + } else if (grad[i] >= pos_threshold) { // set data to 10 + residual[i] = grad[i] - pos_threshold; switch (col_id) { case 0: (*ch_ptr) |= 0x80; // binary: (10)00 0000 @@ -163,36 +175,35 @@ struct quantize_2bit { }; template -void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs) { - using namespace mshadow; - using namespace mxnet_op; +void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float neg_threshold, const float pos_threshold) { // First, init the memory of output to 0x00000000 - Kernel::Launch(s, inputs[4].Size(), - inputs[4].dptr()); // 
compressed array + mxnet_op::Kernel::Launch(s, inputs[2].Size(), + inputs[2].dptr()); // compressed array // Then, init threshold and original size - Kernel::Launch(s, 1, - inputs[4].dptr(), // compressed array - inputs[2].dptr(), // negative threshold - inputs[3].dptr(), // positive threshold - inputs[0].Size()); // original size + mxnet_op::Kernel::Launch(s, 1, + inputs[2].dptr(), // compressed array + neg_threshold, pos_threshold, + inputs[0].Size()); // Finally, compress the data and calculate new residual - Kernel::Launch(s, inputs[0].Size(), - inputs[4].dptr()+3, // compressed array + mxnet_op::Kernel::Launch(s, inputs[0].Size(), + inputs[2].dptr()+3, // compressed array inputs[0].dptr(), // input array inputs[1].dptr(), // residual array - inputs[2].dptr(), // negative threshold - inputs[3].dptr()); // positive threshold + neg_threshold, // negative threshold + pos_threshold); // positive threshold } +// function defined as operator template void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - // For now, this method can only compress the float data mshadow::Stream *s = ctx.get_stream(); - Quantize2BitImpl(s, inputs); + const TwoBitParam& param = nnvm::get(attrs.parsed); + Quantize2BitImpl(s, inputs, param.neg_threshold, param.pos_threshold); } inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, @@ -200,20 +211,16 @@ inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { // 0. input array // 1. residual array - // 2. negative threshold - // 3. positive threshold - // 4. compressed array - CHECK_EQ(in_attrs->size(), 5U); + // 2. 
compressed array + CHECK_EQ(in_attrs->size(), 3U); CHECK(!shape_is_none(in_attrs->at(0))); CHECK(!shape_is_none(in_attrs->at(1))); CHECK_EQ(in_attrs->at(0).Size(), in_attrs->at(1).Size()); - CHECK(shape_is_scalar(in_attrs->at(2))); - CHECK(shape_is_scalar(in_attrs->at(3))); int shape = in_attrs->at(0).Size() % 16 == 0 ? in_attrs->at(0).Size() / 16 + 3: in_attrs->at(0).Size() / 16 + 4; - CHECK_EQ(in_attrs->at(4).Size(), shape) + CHECK_EQ(in_attrs->at(2).Size(), shape) << "The size of output array is not equal to " << "the size of compressed array"; return true; @@ -224,22 +231,14 @@ inline bool Quantize2BitType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { // 0. input array // 1. residual array - // 2. negative threshold - // 3. positive threshold - // 4. compressed array - CHECK_EQ(in_attrs->size(), 5U); + // 2. compressed array + CHECK_EQ(in_attrs->size(), 3U); // check input CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) << "`quantize_2bit_` only supports float32 input for now"; CHECK_EQ((*in_attrs)[1], mshadow::kFloat32) << "`quantize_2bit_` only supports float32 input for now"; CHECK_EQ((*in_attrs)[2], mshadow::kFloat32) - << "the third input of `quantize_2bit` should be " - << "a tensor with type of float"; - CHECK_EQ((*in_attrs)[3], mshadow::kFloat32) - << "the fourth input of `quantize_2bit` should be " - << "a tensor with type of float"; - CHECK_EQ((*in_attrs)[4], mshadow::kFloat32) << "`quantize_2bit_` only supports float32 input for now"; return true; } diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index ea3e2e04770c..8fcaf647e9e8 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -25,7 +25,7 @@ namespace mxnet { namespace op { - +DMLC_REGISTER_PARAMETER(TwoBitParam); NNVM_REGISTER_OP(_contrib_quantize_2bit) .describe(R"code(Quantize a input tensor using 2-bit compression with residual array and user-specified threshold. 
@@ -49,30 +49,30 @@ original array will be compressed into a single element in the last element. In two bit compress, every 16 float data in original array will be packed into one float data in output array. )code" ADD_FILELINE) -.set_num_inputs(5) +.set_num_inputs(3) .set_num_outputs(0) +.set_attr_parser(ParamParser) .set_attr("FInferShape", Quantize2BitShape) .set_attr("FInferType", Quantize2BitType) .set_attr("FCompute", Quantize2BitCompute) .set_attr("FGradient", ElemwiseGradUseNone{"_quantize_2bit"}) .set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { - return std::vector{2,4}; + return std::vector{1,2}; }) .add_argument("gradient_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_argument("residual_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") -.add_argument("neg_threshold", "NDArray-or-Symbol", "The negative shreshold") -.add_argument("pos_shreshold", "NDArray-or-Symbol", "The positive shreshold") -.add_argument("compressed_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); +.add_argument("compressed_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") +.add_arguments(TwoBitParam::__FIELDS__()); NNVM_REGISTER_OP(_contrib_create_2bit) -.describe(R"code(Tp generate a compressed array with right shape. + .describe(R"code(Tp generate a compressed array with right shape. 
)code" ADD_FILELINE) -.set_num_inputs(1) -.set_num_outputs(1) -.set_attr("FInferShape", Create2BitArrayShape) -.set_attr("FInferType", Create2BitArray2BitType) + .set_num_inputs(1) + .set_num_outputs(1) + .set_attr("FInferShape", Create2BitArrayShape) + .set_attr("FInferType", Create2BitArrayType) .set_attr("FCompute", Create2BitArrayCompute) .set_attr("FGradient", ElemwiseGradUseNone{"_create_2bit"}) .add_argument("input", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); From 1fac41fb0b3fd5fca4f7e420874802385d4cfe88 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 9 Oct 2017 10:46:06 -0700 Subject: [PATCH 083/237] Added many checks for 0 --- src/kvstore/kvstore_dist.h | 39 ++++++++++++++++++++++- src/kvstore/kvstore_dist_server.h | 40 +++++++++++++++++++++++- src/ndarray/ndarray.cc | 5 +-- src/operator/contrib/two_bit_quantize.cc | 8 +++-- 4 files changed, 84 insertions(+), 8 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 635e2f13d718..0b9676b511de 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -303,9 +303,21 @@ namespace kvstore { // merge over devices int key = uniq_keys[i]; const auto &vals = grouped_vals[i]; + if (compress_!="none") { + vals[0].WaitToRead(); + for (int i = 0; i < vals[0].shape().Size(); i++) { + CHECK_EQ(*((float *) vals[0].data().dptr_ + i), 0); + } + } + NDArray merged = do_merge ? 
comm_->Reduce(key, vals, priority) : vals[0]; const auto storage_type = merged.storage_type(); - + if (compress_!="none") { + merged.WaitToRead(); + for (int i = 0; i < merged.shape().Size(); i++) { + CHECK_EQ(*((float *) merged.data().dptr_ + i), 0); + } + } auto &comm_buf = comm_buf_[key]; if (merged.ctx().dev_mask() == cpu::kDevMask) { // make sure the previous push/pull is completed @@ -321,6 +333,12 @@ namespace kvstore { } CopyFromTo(merged, &comm_buf); } + if (compress_!="none") { + comm_buf.WaitToRead(); + for (int i = 0; i < comm_buf.shape().Size(); i++) { + CHECK_EQ(*((float *) comm_buf.data().dptr_ + i), 0); + } + } if (compress_ != "none") { auto &small_buf = compr_buf_[key]; @@ -348,6 +366,7 @@ namespace kvstore { LOG(FATAL) << "compression for non default storage type unsupported"; } } else { + std::cout<<"About to push" <Slice(cur_from, end_part_data); Quantize(part_data, &part_compr, &part_res, neg_threshold_, pos_threshold_, compress_, priority); + part_compr.WaitToRead(); + + CHECK_EQ(*(float *) part_compr.data().dptr_,-0.5); + CHECK_EQ(*((float *) part_compr.data().dptr_+1),0.5); + for(int i=3; i>= 1; + } + + // Reverse the string since now it's backwards + std::string temp(str.rbegin(), str.rend()); + str = temp; + } /** * \brief executor runs a function using the thread called \ref Start */ @@ -393,10 +409,22 @@ class KVStoreDistServer { decomp_buf = NDArray(dshape, Context()); } } - + if(compress_!="none") { + CHECK_EQ(*((float *) recved.data().dptr_),-0.5); + CHECK_EQ(*((float *) recved.data().dptr_+1),0.5); + CHECK_EQ(*((float *) recved.data().dptr_+2),dshape.Size()); + for(int i=3; iResponse(req_meta); stored.WaitToRead(); + if(compress_!="none") { + CHECK_EQ(*((float *) recved.data().dptr_+2),dshape.Size()); + for(int i=0; ictx().dev_mask(); - std::vector const_vars; - const_vars.push_back(from.var()); - std::vector inputs(2); inputs[0] = from.data(); inputs[1] = to->data(); @@ -614,7 +611,7 @@ void Dequantize(const NDArray &from, NDArray 
*to, std::string& compress, int pri if (compress == "2bit") { Engine::Get()->PushSync([inputs](RunContext ctx) { mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); - }, from.ctx(), const_vars, {ret.var()}, + }, from.ctx(), {from.var()}, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { LOG(FATAL) << "Unsupported dequantization "<("FInferType", Dequantize2BitType) .set_attr("FCompute", Dequantize2BitCompute) .set_attr("FGradient", ElemwiseGradUseNone{"_dequantize_2bit"}) -.add_argument("input_1", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") -.add_argument("input_2", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); +.set_attr("FMutateInputs", +[](const nnvm::NodeAttrs& attrs) { + return std::vector{1}; +}) +.add_argument("quantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") +.add_argument("dequantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); } // namespace op } // namespace mxnet From 1fdbdf0f78e6ec6203f6ee1ace13ac687fd5b63f Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 9 Oct 2017 10:48:24 -0700 Subject: [PATCH 084/237] cmake changes --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 132b0e1c63d2..3bbcbaf20aad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -184,7 +184,7 @@ if(USE_OPENCV) message(STATUS " OpenCV_LIBS=${OpenCV_LIBS}") message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})") add_definitions(-DMXNET_USE_OPENCV=1) - if(NOT MSVC) + if(NOT APPLE) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-undefined,error") else() From c1a9adde8ff723599f348f5a7d0cf8e61a85ec5a Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 9 Oct 2017 11:27:31 -0700 Subject: [PATCH 085/237] formatting issues for easier merge --- src/kvstore/kvstore_dist.h | 16 +++++----------- src/kvstore/kvstore_dist_server.h | 7 ++++--- 2 files 
changed, 9 insertions(+), 14 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index ae4111039280..d60a3c70dc26 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -34,7 +34,6 @@ #include "./kvstore_dist_server.h" #include "../ndarray/ndarray_function.h" #include // for uint32_t - #if MKL_EXPERIMENTAL == 1 #include #include "../operator/mkl/mkl_memory-inl.h" @@ -204,6 +203,7 @@ namespace kvstore { std::vector uniq_keys; std::vector > grouped_vals; GroupKVPairsPull(keys, values, &uniq_keys, &grouped_vals); + for (size_t i = 0; i < uniq_keys.size(); ++i) { int key = uniq_keys[i]; // use the same array for merging to guarantee that pull always happens @@ -227,10 +227,8 @@ namespace kvstore { #endif real_t* data = recv_buf.data().dptr(); // false means not to delete data when SArray is deleted - auto vals = new ps::SArray(data, size, false); // issue pull - CHECK_NOTNULL(ps_worker_)->ZPull( pskv.keys, vals, &pskv.lens, kDefaultPushPull, [vals, cb](){ delete vals; cb(); }); }; @@ -381,10 +379,8 @@ namespace kvstore { auto push_to_servers = [this, key, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys - size_t size = 0; - real_t* data = nullptr; - size = send_buf.shape().Size(); - data = send_buf.data().dptr(); + size_t size = send_buf.shape().Size(); + real_t* data = send_buf.data().dptr(); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(send_buf.data()); #endif @@ -515,10 +511,8 @@ namespace kvstore { auto push_to_servers = [this, key, comm_buf, pskv, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys - size_t size = 0; - real_t* data = nullptr; - size = small_buf.shape().Size(); - data = small_buf.data().dptr(); + size_t size = small_buf.shape().Size(); + real_t* data = small_buf.data().dptr(); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(small_buf.data()); #endif diff --git a/src/kvstore/kvstore_dist_server.h 
b/src/kvstore/kvstore_dist_server.h index bd48a3f15bac..122966570c02 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -34,7 +34,7 @@ #include #include "ps/ps.h" #include "mxnet/kvstore.h" -#include "../operator/tensor/elemwise_binary_op.h" +#include "../operator/tensor/elemwise_binary_op-inl.h" #include "../operator/tensor/init_op.h" #include "../ndarray/ndarray_function.h" @@ -198,9 +198,9 @@ class KVStoreDistServer { if (merged->request.size() == (size_t) ps::NumWorkers()) { // let the main thread to execute updater_, which is necessary for python if (updater_) { - exec_.Exec([this, key, merged, stored](){ + exec_.Exec([this, key, merged, stored](){ CHECK(updater_); - updater_(key, merged->array, stored); + updater_(key, merged->array, stored); }); } else { // if no updater, just copy @@ -392,6 +392,7 @@ class KVStoreDistServer { int key = DecodeKey(req_data.keys[0]); auto& stored = store_[key]; + // there used several WaitToRead, this is because \a recved's memory // could be deallocated when this function returns. 
so we need to make sure // the operators with \a NDArray are actually finished From 24bc3611488e2e7de7d4d3eb063b0a00e3e70a0b Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 9 Oct 2017 11:29:42 -0700 Subject: [PATCH 086/237] fix rate --- tests/nightly/dist_sync_kvstore.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 5b2b0ae33eb5..e29e8913008b 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -53,7 +53,7 @@ def init_kv(): my_rank = kv.rank nworker = kv.num_workers # init updater on servers - kv.set_optimizer(mx.optimizer.create('test', rescale_grad=2)) + kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate)) return kv, my_rank, nworker def init_kv_compressed(kv): @@ -191,7 +191,7 @@ def verify_residual(kv, pos_threshold, nworker): kv.push(d[0], mx.nd.ones(d[1])*(pos_threshold - 0.4)) val2 = mx.nd.zeros(d[1]) kv.pull(d[0],val2) - curval = pos_threshold * 2 * nworker + curval = pos_threshold * rate * nworker check_diff_to_scalar(val2, curval) kv.push(d[0], mx.nd.ones(d[1])*0.2) val3= mx.nd.zeros(d[1]) @@ -200,7 +200,7 @@ def verify_residual(kv, pos_threshold, nworker): kv.push(d[0], mx.nd.ones(d[1])*(pos_threshold-0.2)) val4 = mx.nd.zeros(d[1]) kv.pull(d[0],val4) - curval += pos_threshold*2*nworker + curval += pos_threshold*rate*nworker check_diff_to_scalar(val4, curval) def check_ones(kv, pos, nworker): @@ -211,7 +211,7 @@ def check_ones(kv, pos, nworker): kv.push('221',mx.nd.ones(big_shape)*pos*4) val2 = mx.nd.zeros(big_shape) kv.pull('221', val2) - newval = curval + 2*nworker*pos + newval = curval + rate*nworker*pos check_diff_to_scalar(val2, newval) def check_pull_before_push(kv): From 3a7985a1094a0ade16497abe557abfc9a70669c0 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 9 Oct 2017 11:47:53 -0700 Subject: [PATCH 087/237] fix compilation errors after merge --- src/kvstore/kvstore_dist.h | 3 
+-- src/ndarray/ndarray.cc | 11 ++++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index d60a3c70dc26..53fe4cf1bf36 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -318,7 +318,7 @@ namespace kvstore { } auto &comm_buf = comm_buf_[key]; if (merged.ctx().dev_mask() == cpu::kDevMask) { - send_buf = merged; // avoid memory copy + comm_buf= merged; // avoid memory copy } else { if (comm_buf.is_none()) { if (storage_type == kDefaultStorage) { @@ -362,7 +362,6 @@ namespace kvstore { LOG(FATAL) << "compression for non default storage type unsupported"; } } else { - std::cout<<"About to push" <PushSync([inputs](RunContext ctx) { - mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); - }, from.ctx(), {from.var()}, {ret.var()}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); + mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); + }, from.ctx(), {from.var()}, {ret.var()}, + FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { - LOG(FATAL) << "Unsupported dequantization "< Date: Tue, 10 Oct 2017 13:25:24 -0700 Subject: [PATCH 088/237] fix compile error and ndarray thresholds in dequantize --- src/ndarray/ndarray.cc | 2 +- src/operator/contrib/two_bit_quantize-inl.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 167da3259c37..57edecf62707 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -634,7 +634,7 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri if (compress == "2bit") { Engine::Get()->PushSync([inputs](RunContext ctx) { mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); - }, from.ctx(), const_vars, {ret.var()}, + }, from.ctx(), {from.var()}, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); } else { LOG(FATAL) << 
"Unsupported dequantization "<(in+block_id); @@ -317,8 +317,8 @@ void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& input mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size inputs[1].dptr(), // out array inputs[0].dptr()+3, // compressed array - inputs[0].dptr(), // negative threshold - inputs[0].dptr()+1); // positve threshold + *(inputs[0].dptr()), // negative threshold + *(inputs[0].dptr()+1)); // positve threshold } template From 8c6ba4f4e2dfdd0298d42d533120b82fd3a1ace1 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 10 Oct 2017 13:28:44 -0700 Subject: [PATCH 089/237] fix compile error and ndarray thresholds in dequantize --- src/operator/contrib/two_bit_quantize-inl.h | 16 +++++++------- tests/nightly/dist_sync_kvstore.py | 24 +++++++++------------ 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index afabbb51dca7..fa57f64833af 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -263,10 +263,10 @@ struct dequantize_2bit { case 0: // positve if (((*ch_ptr) & (0xc0)) == 0x80) { // binary: (10)00 0000 - out[i] = *pos_threshold; + out[i] = pos_threshold; // negative } else if (((*ch_ptr) & (0xc0)) == 0x40) { // binary: (01)00 0000 - out[i] = *neg_threshold; + out[i] = neg_threshold; } else { // 0 out[i] = 0; } @@ -274,10 +274,10 @@ struct dequantize_2bit { case 1: // positve if (((*ch_ptr) & (0x30)) == 0x20) { // binary: 00(10) 0000 - out[i] = *pos_threshold; + out[i] = pos_threshold; // negative } else if (((*ch_ptr) & (0x30)) == 0x10) { // binary: 00(01) 0000 - out[i] = *neg_threshold; + out[i] = neg_threshold; } else { // 0 out[i] = 0; } @@ -285,10 +285,10 @@ struct dequantize_2bit { case 2: // positve if (((*ch_ptr) & (0x0c)) == 0x08) { // binary: 00(10) 0000 - out[i] = *pos_threshold; + out[i] = pos_threshold; // negative } else if (((*ch_ptr) & (0x0c)) == 0x04) { // 
binary: 00(01) 0000 - out[i] = *neg_threshold; + out[i] = neg_threshold; } else { // 0 out[i] = 0; } @@ -296,10 +296,10 @@ struct dequantize_2bit { case 3: // positve if (((*ch_ptr) & (0x03)) == 0x02) { // binary: 00(10) 0000 - out[i] = *pos_threshold; + out[i] = pos_threshold; // negative } else if (((*ch_ptr) & (0x03)) == 0x01) { // binary: 00(01) 0000 - out[i] = *neg_threshold; + out[i] = neg_threshold; } else { // 0 out[i] = 0; } diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index e29e8913008b..e3e2850aa5ea 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -228,22 +228,18 @@ def check_zero(kv): # print ('worker '+str(my_rank)+' started') - check_default_keys(kv, my_rank, nworker) - check_row_sparse_keys(kv, my_rank, nworker) - check_row_sparse_keys_with_zeros(kv, my_rank, nworker) - check_big_row_sparse_keys(kv, my_rank, nworker) + # check_default_keys(kv, my_rank, nworker) + # check_row_sparse_keys(kv, my_rank, nworker) + # check_row_sparse_keys_with_zeros(kv, my_rank, nworker) + # check_big_row_sparse_keys(kv, my_rank, nworker) # print('worker ' + str(my_rank) + ' is done with non compression tests') - # kv, pos, neg = init_kv_compressed(kv) - # check_pull_before_push(kv) - # check_zero(kv) - # verify_residual(kv, pos, nworker) - # check_ones(kv, pos, nworker) - # print('worker ' + str(my_rank) + ' is done with compression tests') -def test(): - val = mx.nd.zeros(big_shape) - check_diff_to_scalar(val,0) + kv, pos, neg = init_kv_compressed(kv) + check_pull_before_push(kv) + check_zero(kv) + verify_residual(kv, pos, nworker) + check_ones(kv, pos, nworker) + print('worker ' + str(my_rank) + ' is done with compression tests') if __name__ == "__main__": - # test() test_sync_push_pull() From 96fa9b3c3e50235155e3eef4ba273661f158585a Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 10 Oct 2017 13:31:24 -0700 Subject: [PATCH 090/237] fix compile error --- src/ndarray/ndarray.cc | 3 
++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 57edecf62707..b63ba6298f9d 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -594,7 +594,8 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, if (a == gpu::kDevMask && b == gpu::kDevMask) { if (compress == "2bit") { Engine::Get()->PushSync([inputs](RunContext ctx) { - mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs); + mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, + neg_threshold, pos_threshold); }, from.ctx(), const_vars, mutable_vars, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); } else { From baae59d6df0c52ac429e5640d6a62e3b61bbeaff Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 10 Oct 2017 14:15:45 -0700 Subject: [PATCH 091/237] fix compile error, and add comments --- src/ndarray/ndarray.cc | 2 +- src/operator/contrib/two_bit_quantize-inl.h | 15 ++++++++---- src/operator/contrib/two_bit_quantize.cc | 27 ++++++++++++--------- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index b63ba6298f9d..9b4ea1a1ec17 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -593,7 +593,7 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, #if MXNET_USE_CUDA if (a == gpu::kDevMask && b == gpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([inputs](RunContext ctx) { + Engine::Get()->PushSync([inputs, neg_threshold, pos_threshold](RunContext ctx) { mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, neg_threshold, pos_threshold); }, from.ctx(), const_vars, mutable_vars, diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index fa57f64833af..1a7104b487a4 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -122,6 +122,12 @@ struct 
quantize_2bit { const float pos_threshold) { // Add residual to gradient grad[i] += residual[i]; + + // Considers each float in the output array as forming a block + // Each block comprises a 4x4 grid. Each value in this grid + // refers to one float in the original grad array + // Only supports float32 + // get block id int block_id = i / 16; char* ch_ptr = reinterpret_cast(out+block_id); @@ -312,13 +318,12 @@ struct dequantize_2bit { template void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs) { - //using namespace mshadow; - // For now, this method can only decompress the float data + // Can only decompress the float32 data mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size - inputs[1].dptr(), // out array - inputs[0].dptr()+3, // compressed array + inputs[1].dptr(), // out array + inputs[0].dptr()+3, // compressed array *(inputs[0].dptr()), // negative threshold - *(inputs[0].dptr()+1)); // positve threshold + *(inputs[0].dptr()+1)); // positive threshold } template diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index be5b24f71798..d02b9ee83766 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -27,27 +27,26 @@ namespace mxnet { namespace op { DMLC_REGISTER_PARAMETER(TwoBitParam); NNVM_REGISTER_OP(_contrib_quantize_2bit) -.describe(R"code(Quantize a input tensor using 2-bit compression with residual +.describe(R"code(Quantize an input tensor using 2-bit compression with residual array and user-specified threshold. -For example, assume the input array (gradient) is [-1.0, -5.0, -4.0], and the -residual is [-2.0, 0, 1.0], and the threshold is -4.0 and +4.0, respectively. +For example, assume the input array (gradient) is [5.0, -1.0, -5.0, -4.0], and the +residual is [0.0, -2.0, 0, 1.0], and the threshold is -4.0 and +4.0, respectively. 
In this method, the elements (gradient + residual) >= pos_threshold will be compressed into a 2-bit data '01', and the elements <= neg_threshold will be compressed into a 2-bit data '10'. The other elements will be compressed -into '00', which is represented as zero. +into '00', which is represented as zero. Every 16 floats in the +original array will be packed into one float data in output array. In this example, invoke -quantize_2bit(array, residual, neg_threshold, pos_threshold, out), the 'out' +quantize_2bit(array, residual, out, neg_threshold, pos_threshold), the 'out' will be the compressed array. Note that, the out array can be generated by invoking create_2bit(array). In this example, the 'out' has 4 elements. The first element stores the -neg_threshold (-0.4) and the second element stores the pos_threshold (+0.4), the +neg_threshold (-4.0) and the second element stores the pos_threshold (+4.0), the third element stores the original size of the uncompressed array, and the original array will be compressed into a single element in the last element. -In two bit compress, every 16 float data in original array -will be packed into one float data in output array. )code" ADD_FILELINE) .set_num_inputs(3) .set_num_outputs(0) @@ -67,7 +66,7 @@ will be packed into one float data in output array. NNVM_REGISTER_OP(_contrib_create_2bit) - .describe(R"code(Tp generate a compressed array with right shape. + .describe(R"code(To generate a compressed array with right shape. )code" ADD_FILELINE) .set_num_inputs(1) .set_num_outputs(1) @@ -81,10 +80,14 @@ NNVM_REGISTER_OP(_contrib_dequantize_2bit) .describe(R"code(Dequantize a input tensor compressed by quantize_2bit. The dequantize_2bit takes two input arguments. The first input is a NDArray, -which has been generated by quantize_2bit(). The second input is also a -NDArray that has the same shape with the original array before compressing. +which has been generated by quantize_2bit(). 
This operator expects the first +three elements to be the negative threshold, positive threshold, and the size +of the original uncompressed array. Starting from the fourth element are +compressed values of the original array. +The second input is also a NDArray that has the same shape as +the original array before compressing. -Using the example as was described above. +Using the example as was described for quantize_2bit Invoke dequantize_2bit(out, array), the 'array' argument will become [0, -4.0, 0], where -4.0 is the negative threshold. From 2d5696ea2535dab085b8edade8bd74f3fe943f00 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 10 Oct 2017 14:23:26 -0700 Subject: [PATCH 092/237] update operator comments --- src/operator/contrib/two_bit_quantize.cc | 28 +++++++++++++----------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index d02b9ee83766..4697d5b41852 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -30,23 +30,25 @@ NNVM_REGISTER_OP(_contrib_quantize_2bit) .describe(R"code(Quantize an input tensor using 2-bit compression with residual array and user-specified threshold. +The quantized_2bit operator takes 5 arguments as follows: +quantize_2bit(array, residual, out, neg_threshold, pos_threshold)`. The `out` +variable will be the compressed array. Note that, the `out` array can be generated by +invoking `create_2bit(array)`. + For example, assume the input array (gradient) is [5.0, -1.0, -5.0, -4.0], and the residual is [0.0, -2.0, 0, 1.0], and the threshold is -4.0 and +4.0, respectively. -In this method, the elements (gradient + residual) >= pos_threshold will be -compressed into a 2-bit data '01', and the elements <= neg_threshold will be -compressed into a 2-bit data '10'. 
The other elements will be compressed +In this method, the elements whose (gradient + residual) >= pos_threshold will be +compressed into 2-bits '01', and the elements whose +(gradient + residual) <= neg_threshold will be +compressed into 2-bits '10'. The other elements will be compressed into '00', which is represented as zero. Every 16 floats in the original array will be packed into one float data in output array. -In this example, invoke -quantize_2bit(array, residual, out, neg_threshold, pos_threshold), the 'out' -will be the compressed array. Note that, the out array can be generated by -invoking create_2bit(array). - -In this example, the 'out' has 4 elements. The first element stores the +In this example, 'out' has 4 elements. The first element stores the neg_threshold (-4.0) and the second element stores the pos_threshold (+4.0), the third element stores the original size of the uncompressed array, and the original array will be compressed into a single element in the last element. +The residual is also updated to [1.0, -3.0, -1.0, -3.0]. )code" ADD_FILELINE) .set_num_inputs(3) .set_num_outputs(0) @@ -61,7 +63,7 @@ original array will be compressed into a single element in the last element. }) .add_argument("gradient_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_argument("residual_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") -.add_argument("compressed_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") +.add_argument("quantized_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_arguments(TwoBitParam::__FIELDS__()); @@ -77,9 +79,9 @@ NNVM_REGISTER_OP(_contrib_create_2bit) .add_argument("input", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); NNVM_REGISTER_OP(_contrib_dequantize_2bit) -.describe(R"code(Dequantize a input tensor compressed by quantize_2bit. +.describe(R"code(Dequantize an input tensor compressed by quantize_2bit. 
-The dequantize_2bit takes two input arguments. The first input is a NDArray, +The dequantize_2bit operator takes two input arguments. The first input is a NDArray, which has been generated by quantize_2bit(). This operator expects the first three elements to be the negative threshold, positive threshold, and the size of the original uncompressed array. Starting from the fourth element are @@ -90,7 +92,7 @@ the original array before compressing. Using the example as was described for quantize_2bit Invoke dequantize_2bit(out, array), the 'array' argument will become -[0, -4.0, 0], where -4.0 is the negative threshold. +[4.0, 0, -4.0, 0], where -4.0 is the negative threshold. )code" ADD_FILELINE) .set_num_inputs(2) .set_num_outputs(0) From 36e1b5117117276a9603b5bda01971a83d0dd1d0 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 10 Oct 2017 14:45:15 -0700 Subject: [PATCH 093/237] comment checks --- src/kvstore/kvstore_dist.h | 36 ++++++++++++++++----------------- src/ndarray/ndarray_function.cu | 3 ++- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 53fe4cf1bf36..d8b1441e31d7 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -301,21 +301,21 @@ namespace kvstore { // merge over devices int key = uniq_keys[i]; const auto &vals = grouped_vals[i]; - if (compress_!="none") { - vals[0].WaitToRead(); - for (int i = 0; i < vals[0].shape().Size(); i++) { - CHECK_EQ(*((float *) vals[0].data().dptr_ + i), 0); - } - } +// if (compress_!="none") { +// vals[0].WaitToRead(); +// for (int i = 0; i < vals[0].shape().Size(); i++) { +// CHECK_EQ(*((float *) vals[0].data().dptr_ + i), 0); +// } +// } NDArray merged = do_merge ? 
comm_->Reduce(key, vals, priority) : vals[0]; const auto storage_type = merged.storage_type(); - if (compress_!="none") { - merged.WaitToRead(); - for (int i = 0; i < merged.shape().Size(); i++) { - CHECK_EQ(*((float *) merged.data().dptr_ + i), 0); - } - } +// if (compress_!="none") { +// merged.WaitToRead(); +// for (int i = 0; i < merged.shape().Size(); i++) { +// CHECK_EQ(*((float *) merged.data().dptr_ + i), 0); +// } +// } auto &comm_buf = comm_buf_[key]; if (merged.ctx().dev_mask() == cpu::kDevMask) { comm_buf= merged; // avoid memory copy @@ -329,12 +329,12 @@ namespace kvstore { } CopyFromTo(merged, &comm_buf); } - if (compress_!="none") { - comm_buf.WaitToRead(); - for (int i = 0; i < comm_buf.shape().Size(); i++) { - CHECK_EQ(*((float *) comm_buf.data().dptr_ + i), 0); - } - } +// if (compress_!="none") { +// comm_buf.WaitToRead(); +// for (int i = 0; i < comm_buf.shape().Size(); i++) { +// CHECK_EQ(*((float *) comm_buf.data().dptr_ + i), 0); +// } +// } if (compress_ != "none") { auto &small_buf = compr_buf_[key]; diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index 6071027ffee1..52c5d86f40a7 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -209,7 +209,8 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i } template<> -void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { +void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, + const float neg_threshold, const float pos_threshold) { mxnet::op::Quantize2BitImpl(s,inputs); } From f73e46389c8aa322f8b9f4ec5617ff60683f51b0 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 10 Oct 2017 14:49:34 -0700 Subject: [PATCH 094/237] comment checks --- src/kvstore/kvstore_dist.h | 12 ++++++------ src/kvstore/kvstore_dist_server.h | 32 +++++++++++++++---------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h 
index d8b1441e31d7..90af580b386c 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -544,7 +544,7 @@ namespace kvstore { size_t cur_from = 0; //should be start of meta in new small_buf size_t cur_to = 0; - for(int i=0; iSlice(cur_to, cur_to+pskv.lens[i]); @@ -562,11 +562,11 @@ namespace kvstore { Quantize(part_data, &part_compr, &part_res, neg_threshold_, pos_threshold_, compress_, priority); part_compr.WaitToRead(); - CHECK_EQ(*(float *) part_compr.data().dptr_,-0.5); - CHECK_EQ(*((float *) part_compr.data().dptr_+1),0.5); - for(int i=3; iResponse(req_meta); stored.WaitToRead(); - if(compress_!="none") { - CHECK_EQ(*((float *) recved.data().dptr_+2),dshape.Size()); - for(int i=0; i Date: Tue, 10 Oct 2017 14:52:01 -0700 Subject: [PATCH 095/237] compile error --- src/ndarray/ndarray_function.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index 52c5d86f40a7..332303c22255 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -211,7 +211,7 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImpl(s,inputs); + mxnet::op::Quantize2BitImpl(s,inputs, neg_threshold, pos_threshold); } } // namespace ndarray From 8fd1cdeaa19f554ad8145158676e1fd69f7f3cb3 Mon Sep 17 00:00:00 2001 From: Rahul Date: Tue, 10 Oct 2017 15:17:56 -0700 Subject: [PATCH 096/237] working on local kvstore compress test --- tests/nightly/dist_sync_kvstore.py | 1 - tests/nightly/test_kvstore.py | 36 +++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index e3e2850aa5ea..a5f37d57d180 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ 
-207,7 +207,6 @@ def check_ones(kv, pos, nworker): val = mx.nd.zeros(big_shape) kv.pull('221', val) curval = val[0][0].asnumpy()[0] - print(curval) kv.push('221',mx.nd.ones(big_shape)*pos*4) val2 = mx.nd.zeros(big_shape) kv.pull('221', val2) diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index 081bc9c5a456..19ce92cea391 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -22,6 +22,10 @@ import mxnet as mx import numpy as np +def check_diff_to_scalar(A, x, rank=None): + """ assert A == x""" + assert(np.sum(np.abs((A - x).asnumpy())) == 0), (rank, A.asnumpy(), x) + keys = [3, 5, 7] # let the last shape exceed MXNET_KVSTORE_BIGARRAY_BOUND shapes = [(4, 4), (100, 100), (2000, 2000)]; @@ -55,9 +59,29 @@ def test_kvstore(kv_type): err = sum(err) / np.sum(np.abs(res[j])) assert(err < 1e-6), (err, shapes[j]) -test_kvstore('local_update_cpu') -test_kvstore('local_allreduce_cpu') -test_kvstore('local_allreduce_device') +def test_compress_kvstore(kv_type, compress='2bit', neg=-0.5, pos=0.5): + print(kv_type) + kv = mx.kv.create(kv_type) + kv.set_compress({'compress':compress, 'neg_threshold':neg, 'pos_threshold':pos}) + kv.set_optimizer(mx.optimizer.create('test', rescale_grad=2)) + for k, s in zip(keys, shapes): + kv.init(k, mx.nd.zeros(s)) + # data = [[np.(s)*2-1 for i in range(nworker)] for s in shapes] + def pull_before_push(kv): + for j in range(len(keys)): + out = [mx.nd.zeros(shapes[j], mx.gpu(g))+1 for g in range(nworker)] + check_diff_to_scalar(out, 1) + kv.pull(keys[j], out=out) + check_diff_to_scalar(out, 0) + # err = [np.sum(np.abs(o.asnumpy() - res[j])) for o in out] + # err = sum(err) / np.sum(np.abs(res[j])) + # assert(err < 1e-6), (err, shapes[j]) + + pull_before_push(kv) +test_compress_kvstore('local_update_cpu') +# test_kvstore('local_update_cpu') +# test_kvstore('local_allreduce_cpu') +# test_kvstore('local_allreduce_device') ## group keys interface def test_group_kvstore(kv_type): @@ -79,6 +103,6 @@ 
def test_group_kvstore(kv_type): err = sum(err) / np.sum(np.abs(a)) assert(err < 1e-6), (err, a.shape) -test_group_kvstore('local_update_cpu') -test_group_kvstore('local_allreduce_cpu') -test_group_kvstore('local_allreduce_device') +# test_group_kvstore('local_update_cpu') +# test_group_kvstore('local_allreduce_cpu') +# test_group_kvstore('local_allreduce_device') From a0c2a2abc68acb15c116976684ddb5c85340ac49 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 10 Oct 2017 19:10:12 -0700 Subject: [PATCH 097/237] fix module api compressparams, and change quantize tblob to inside engine --- python/mxnet/module/module.py | 3 ++- src/kvstore/comm.h | 4 +-- src/ndarray/ndarray.cc | 30 +++++++++++++-------- src/operator/contrib/two_bit_quantize-inl.h | 1 + 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index cbebba23f873..72dca1130ab0 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -62,7 +62,7 @@ class Module(BaseModule): """ def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',), logger=logging, context=ctx.cpu(), work_load_list=None, - fixed_param_names=None, state_names=None): + fixed_param_names=None, state_names=None, compress_params=None): super(Module, self).__init__(logger=logger) if isinstance(context, ctx.Context): @@ -99,6 +99,7 @@ def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',), self._aux_params = None self._params_dirty = False + self._compress_params = compress_params if compress_params else {'compress':'none'} self._optimizer = None self._kvstore = None self._update_on_kvstore = None diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index e6c1a83253dd..e71ebed7bc08 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -530,12 +530,12 @@ class CommDevice : public Comm { for (size_t i = 0; i < src.size(); ++i) { buf.copy_buf[i] = NDArray( buf.merged.shape(), buf.merged.ctx(), 
false, buf.merged.dtype()); - if (compress_.compare("none") != 0) { + if (compress_ != "none") { buf.residual[i] = NDArray( buf.merged.shape(), src[i].ctx(), false, buf.merged.dtype()); buf.residual[i] = 0; int bits; - if (compress_.compare("2bit") == 0) { + if (compress_ =="2bit") { bits = 16; long int small_size = buf.merged.shape().Size() % bits == 0 ? buf.merged.shape().Size() / bits + 3 : diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 9b4ea1a1ec17..bb3c2c2545dc 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -573,15 +573,14 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, std::vector mutable_vars; mutable_vars.push_back(ret.var()); mutable_vars.push_back(res.var()); - - std::vector inputs(3); - inputs[0] = from.data(); - inputs[1] = residual->data(); - inputs[2] = to->data(); - + std::cout<<"going into loop"<PushSync([inputs, neg_threshold, pos_threshold](RunContext ctx) { + Engine::Get()->PushSync([from, residual, to, neg_threshold, pos_threshold](RunContext ctx) { + std::vector inputs(3); + inputs[0] = from.data(); + inputs[1] = residual->data(); + inputs[2] = to->data(); mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, neg_threshold, pos_threshold); }, from.ctx(), const_vars, mutable_vars, @@ -593,7 +592,16 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, #if MXNET_USE_CUDA if (a == gpu::kDevMask && b == gpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([inputs, neg_threshold, pos_threshold](RunContext ctx) { + std::cout<<"pushing to engine"<PushSync([from, residual, to, neg_threshold, pos_threshold](RunContext ctx) { + std::vector inputs(3); + inputs[0] = from.data(); + inputs[1] = residual->data(); + inputs[2] = to->data(); + for(int i=0; i<4; i++) { + CHECK_EQ(*(from.data().dptr()+i), 1.); + } + std::cout<<"passed checks"<(ctx.get_stream(), inputs, neg_threshold, pos_threshold); }, from.ctx(), const_vars, mutable_vars, @@ -623,9 +631,9 @@ 
void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri if (a == cpu::kDevMask && b == cpu::kDevMask) { if (compress == "2bit") { Engine::Get()->PushSync([inputs](RunContext ctx) { - mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); - }, from.ctx(), {from.var()}, {ret.var()}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); + mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); + }, from.ctx(), {from.var()}, {ret.var()}, + FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { LOG(FATAL) << "Unsupported dequantization " << compress << std::endl; } diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 1a7104b487a4..7f2b0619fd27 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -347,6 +347,7 @@ inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, // check input CHECK(!shape_is_none(in_attrs->at(0))); CHECK(!shape_is_none(in_attrs->at(1))); + //TODO(huilgolr) check CHECK_LE(in_attrs->at(1).Size(), in_attrs->at(0).Size()*16) << "The shape of the second input array are " From 52f47e59f7f31a9f7d071669f6ea53c122479e92 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 11 Oct 2017 06:50:39 +0000 Subject: [PATCH 098/237] 2bit arg wrong kvstore --- example/image-classification/common/fit.py | 3 ++- python/mxnet/module/module.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 812ad7462985..8bf081454df7 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -113,7 +113,8 @@ def fit(args, network, data_loader, **kwargs): data_loader : function that returns the train and val data iterators """ # kvstore - kv = mx.kvstore.create(args.kv_store, '2bit') + kv = mx.kvstore.create(args.kv_store) + 
kv.set_compress({'compress':'2bit','pos_threshold':0.5,'neg_threshold':-0.5}) # logging head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index 72dca1130ab0..5b5fe922a66c 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -485,7 +485,7 @@ def init_optimizer(self, kvstore='local', optimizer='sgd', self._sync_params_from_devices() (kvstore, update_on_kvstore) = \ - _create_kvstore(kvstore, len(self._context), self._arg_params, '2bit') + _create_kvstore(kvstore, len(self._context), self._arg_params) batch_size = self._exec_group.batch_size if kvstore and 'dist' in kvstore.type and '_sync' in kvstore.type: From a33492477f7812ef06d84344e89bb3a78986ce25 Mon Sep 17 00:00:00 2001 From: Rahul Date: Tue, 10 Oct 2017 23:53:02 -0700 Subject: [PATCH 099/237] remove log --- src/ndarray/ndarray.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index bb3c2c2545dc..61f5d9b40ac3 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -573,7 +573,6 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, std::vector mutable_vars; mutable_vars.push_back(ret.var()); mutable_vars.push_back(res.var()); - std::cout<<"going into loop"<PushSync([from, residual, to, neg_threshold, pos_threshold](RunContext ctx) { From be8d01d39c230e25e82c8af7262ea646b956cdd8 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 11 Oct 2017 14:30:47 -0700 Subject: [PATCH 100/237] fix gpu dequantize and tests --- src/ndarray/ndarray.cc | 13 +-- src/operator/contrib/two_bit_quantize-inl.h | 24 +++--- tests/nightly/test_kvstore.py | 89 ++++++++++++++++----- tests/python/gpu/test_operator_gpu.py | 31 +++++++ tests/python/unittest/test_operator.py | 12 +-- 5 files changed, 128 insertions(+), 41 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc 
index 61f5d9b40ac3..725d7b34b4b1 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -624,12 +624,12 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri NDArray ret = *to; int a = from.ctx().dev_mask(); int b = to->ctx().dev_mask(); - std::vector inputs(2); - inputs[0] = from.data(); - inputs[1] = to->data(); if (a == cpu::kDevMask && b == cpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([inputs](RunContext ctx) { + Engine::Get()->PushSync([from, to](RunContext ctx) { + std::vector inputs(2); + inputs[0] = from.data(); + inputs[1] = to->data(); mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); }, from.ctx(), {from.var()}, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); @@ -640,7 +640,10 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri #if MXNET_USE_CUDA if (a == gpu::kDevMask && b == gpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([inputs](RunContext ctx) { + Engine::Get()->PushSync([from, to](RunContext ctx) { + std::vector inputs(2); + inputs[0] = from.data(); + inputs[1] = to->data(); mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); }, from.ctx(), {from.var()}, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 7f2b0619fd27..de99b6d42795 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -254,8 +254,8 @@ struct dequantize_2bit { MSHADOW_XINLINE static void Map(int i, float *out, float *in, - const float neg_threshold, - const float pos_threshold) { + float *neg_threshold, + float *pos_threshold) { // get block ptr int block_id = i / 16; char* ch_ptr = reinterpret_cast(in+block_id); @@ -269,10 +269,10 @@ struct dequantize_2bit { case 0: // positve if (((*ch_ptr) & (0xc0)) == 0x80) { 
// binary: (10)00 0000 - out[i] = pos_threshold; + out[i] = *pos_threshold; // negative } else if (((*ch_ptr) & (0xc0)) == 0x40) { // binary: (01)00 0000 - out[i] = neg_threshold; + out[i] = *neg_threshold; } else { // 0 out[i] = 0; } @@ -280,10 +280,10 @@ struct dequantize_2bit { case 1: // positve if (((*ch_ptr) & (0x30)) == 0x20) { // binary: 00(10) 0000 - out[i] = pos_threshold; + out[i] = *pos_threshold; // negative } else if (((*ch_ptr) & (0x30)) == 0x10) { // binary: 00(01) 0000 - out[i] = neg_threshold; + out[i] = *neg_threshold; } else { // 0 out[i] = 0; } @@ -291,10 +291,10 @@ struct dequantize_2bit { case 2: // positve if (((*ch_ptr) & (0x0c)) == 0x08) { // binary: 00(10) 0000 - out[i] = pos_threshold; + out[i] = *pos_threshold; // negative } else if (((*ch_ptr) & (0x0c)) == 0x04) { // binary: 00(01) 0000 - out[i] = neg_threshold; + out[i] = *neg_threshold; } else { // 0 out[i] = 0; } @@ -302,10 +302,10 @@ struct dequantize_2bit { case 3: // positve if (((*ch_ptr) & (0x03)) == 0x02) { // binary: 00(10) 0000 - out[i] = pos_threshold; + out[i] = *pos_threshold; // negative } else if (((*ch_ptr) & (0x03)) == 0x01) { // binary: 00(01) 0000 - out[i] = neg_threshold; + out[i] = *neg_threshold; } else { // 0 out[i] = 0; } @@ -322,8 +322,8 @@ void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& input mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size inputs[1].dptr(), // out array inputs[0].dptr()+3, // compressed array - *(inputs[0].dptr()), // negative threshold - *(inputs[0].dptr()+1)); // positive threshold + inputs[0].dptr(), // negative threshold + inputs[0].dptr()+1); // positive threshold } template diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index 19ce92cea391..2f0f6c998b33 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -32,7 +32,7 @@ def check_diff_to_scalar(A, x, rank=None): lr = .1 nworker = 4 -nrepeat = 10 +nrepeat = 1 ## generate data data = 
[[[np.random.random(s)*2-1 for i in range(nworker)] for s in shapes] for j in range(nrepeat)] @@ -44,7 +44,7 @@ def test_kvstore(kv_type): kv.set_optimizer(mx.optimizer.create('test', rescale_grad=lr)) for k, s in zip(keys, shapes): kv.init(k, mx.nd.zeros(s)) - + res = [np.zeros(s) for s in shapes] for i in range(nrepeat): for j in range(len(keys)): @@ -60,28 +60,81 @@ def test_kvstore(kv_type): assert(err < 1e-6), (err, shapes[j]) def test_compress_kvstore(kv_type, compress='2bit', neg=-0.5, pos=0.5): - print(kv_type) + print(kv_type, compress) + rate = 2 kv = mx.kv.create(kv_type) kv.set_compress({'compress':compress, 'neg_threshold':neg, 'pos_threshold':pos}) - kv.set_optimizer(mx.optimizer.create('test', rescale_grad=2)) + kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate)) for k, s in zip(keys, shapes): kv.init(k, mx.nd.zeros(s)) - # data = [[np.(s)*2-1 for i in range(nworker)] for s in shapes] + def pull_before_push(kv): + for i in range(nrepeat): + for j in range(len(keys)): + out = [mx.nd.ones(shapes[j], mx.gpu(g)) for g in range(nworker)] + kv.pull(keys[j], out=out) + for o in out: + check_diff_to_scalar(o, 0) + + def push_zeros(kv): + for i in range(nrepeat): + for j in range(len(keys)): + kv.push(keys[j], [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)]) + out = [mx.nd.ones(shapes[j], mx.gpu(g)) for g in range(nworker)] + kv.pull(keys[j], out=out) + for o in out: + check_diff_to_scalar(o, 0) + + def verify_residual(kv, neg_threshold, pos_threshold, rate): + for j in range(len(keys)): + kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*0.4 for g in range(nworker)]) + out = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] + kv.pull(keys[j],out=out) + for o in out: + check_diff_to_scalar(o, 0) + + kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*(pos_threshold-0.4) for g in range(nworker)]) + out = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] + kv.pull(keys[j],out=out) + curval = pos_threshold * rate * 
nworker + for o in out: + check_diff_to_scalar(o, curval) + + kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*(0.2) for g in range(nworker)]) + out = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] + kv.pull(keys[j],out=out) + for o in out: + check_diff_to_scalar(o, curval) + + kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*(pos_threshold-0.2) for g in range(nworker)]) + out = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] + kv.pull(keys[j],out=out) + curval += pos_threshold*rate*nworker + for o in out: + check_diff_to_scalar(o, curval) + return curval + + def check_ones(kv, pos, rate, curval): + newval = curval + rate*nworker*pos for j in range(len(keys)): - out = [mx.nd.zeros(shapes[j], mx.gpu(g))+1 for g in range(nworker)] - check_diff_to_scalar(out, 1) + kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*pos*4 for g in range(nworker)]) + out = [mx.nd.ones(shapes[j], mx.gpu(g)) for g in range(nworker)] kv.pull(keys[j], out=out) - check_diff_to_scalar(out, 0) - # err = [np.sum(np.abs(o.asnumpy() - res[j])) for o in out] - # err = sum(err) / np.sum(np.abs(res[j])) - # assert(err < 1e-6), (err, shapes[j]) + for o in out: + check_diff_to_scalar(o, newval) + + pull_before_push(kv) + push_zeros(kv) + curval = verify_residual(kv, neg, pos, rate) + check_ones(kv, pos, rate, curval) + +test_kvstore('local_update_cpu') +test_kvstore('local_allreduce_cpu') +test_kvstore('local_allreduce_device') - pull_before_push(kv) test_compress_kvstore('local_update_cpu') -# test_kvstore('local_update_cpu') -# test_kvstore('local_allreduce_cpu') -# test_kvstore('local_allreduce_device') +test_compress_kvstore('local_allreduce_cpu') +test_compress_kvstore('local_allreduce_device') ## group keys interface def test_group_kvstore(kv_type): @@ -103,6 +156,6 @@ def test_group_kvstore(kv_type): err = sum(err) / np.sum(np.abs(a)) assert(err < 1e-6), (err, a.shape) -# test_group_kvstore('local_update_cpu') -# test_group_kvstore('local_allreduce_cpu') -# 
test_group_kvstore('local_allreduce_device') +test_group_kvstore('local_update_cpu') +test_group_kvstore('local_allreduce_cpu') +test_group_kvstore('local_allreduce_device') diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index c991e501220c..86ddf905ee35 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -1445,6 +1445,37 @@ def test_cross_device_autograd(): assert_almost_equal(dx, x.grad.asnumpy()) +def test_two_bit_quantization_op(): + neg_threshold = -4.0 + pos_threshold = 4.0 + + grad = mx.nd.array([1.0, 1.0, 1.0], ctx=mx.gpu(0)) + residual = mx.nd.array([0.0, 0.0, 0.0], ctx=mx.gpu(0)) + compr = mx.contrib.nd.create_2bit(grad) + mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) + decompr = mx.nd.zeros(grad.shape) + mx.contrib.ndarray.dequantize_2bit(compr, decompr) + assert same(np.zeros(grad.shape), decompr.asnumpy()) + assert same(residual.asnumpy(), np.array([1.0, 1.0, 1.0])) + + grad = mx.nd.array([3.0, 3.0, 3.0], ctx=mx.gpu(0)) + mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) + mx.contrib.ndarray.dequantize_2bit(compr, decompr) + assert same(np.ones(grad.shape)*(pos_threshold.asnumpy()), decompr.asnumpy()) + assert same(residual.asnumpy(), np.array([0.0, 0.0, 0.0])) + + grad = mx.nd.array([1.0, 1.0, 1.0], ctx=mx.gpu(0)) + mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) + mx.contrib.ndarray.dequantize_2bit(compr, decompr) + assert same(np.zeros(grad.shape), decompr.asnumpy()) + assert same(residual.asnumpy(), np.array([1.0, 1.0, 1.0])) + + grad = mx.nd.array([6.0, 6.0, 6.0], ctx=mx.gpu(0)) + mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) + mx.contrib.ndarray.dequantize_2bit(compr, decompr) + assert same(np.ones(grad.shape)*(pos_threshold.asnumpy()), decompr.asnumpy()) + assert same(residual.asnumpy(), np.array([3.0, 3.0, 
3.0])) + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 33895a49b0ba..298ec25f4370 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3494,32 +3494,32 @@ def test_quantization_op(): assert same(a_.asnumpy(), a_real.asnumpy()) def test_two_bit_quantization_op(): - neg_threshold = mx.nd.array([-4.0]) - pos_threshold = mx.nd.array([4.0]) + neg_threshold = -4.0 + pos_threshold = 4.0 grad = mx.nd.array([1.0, 1.0, 1.0]) residual = mx.nd.array([0.0, 0.0, 0.0]) compr = mx.contrib.nd.create_2bit(grad) - mx.contrib.ndarray.quantize_2bit(grad, residual, neg_threshold, pos_threshold, compr) + mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) decompr = mx.nd.zeros(grad.shape) mx.contrib.ndarray.dequantize_2bit(compr, decompr) assert same(np.zeros(grad.shape), decompr.asnumpy()) assert same(residual.asnumpy(), np.array([1.0, 1.0, 1.0])) grad = mx.nd.array([3.0, 3.0, 3.0]) - mx.contrib.ndarray.quantize_2bit(grad, residual, neg_threshold, pos_threshold, compr) + mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) mx.contrib.ndarray.dequantize_2bit(compr, decompr) assert same(np.ones(grad.shape)*(pos_threshold.asnumpy()), decompr.asnumpy()) assert same(residual.asnumpy(), np.array([0.0, 0.0, 0.0])) grad = mx.nd.array([1.0, 1.0, 1.0]) - mx.contrib.ndarray.quantize_2bit(grad, residual, neg_threshold, pos_threshold, compr) + mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) mx.contrib.ndarray.dequantize_2bit(compr, decompr) assert same(np.zeros(grad.shape), decompr.asnumpy()) assert same(residual.asnumpy(), np.array([1.0, 1.0, 1.0])) grad = mx.nd.array([6.0, 6.0, 6.0]) - mx.contrib.ndarray.quantize_2bit(grad, residual, neg_threshold, pos_threshold, compr) + mx.contrib.ndarray.quantize_2bit(grad, residual, compr, 
neg_threshold, pos_threshold) mx.contrib.ndarray.dequantize_2bit(compr, decompr) assert same(np.ones(grad.shape)*(pos_threshold.asnumpy()), decompr.asnumpy()) assert same(residual.asnumpy(), np.array([3.0, 3.0, 3.0])) From bb473a4229a05bd968f0530b3a9b1790f7a897da Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 11 Oct 2017 15:01:18 -0700 Subject: [PATCH 101/237] fix seg fault in quantize and test indent --- src/ndarray/ndarray.cc | 5 ----- tests/nightly/test_kvstore.py | 10 ++++------ 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 725d7b34b4b1..a4a4e2ef5351 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -591,16 +591,11 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, #if MXNET_USE_CUDA if (a == gpu::kDevMask && b == gpu::kDevMask) { if (compress == "2bit") { - std::cout<<"pushing to engine"<PushSync([from, residual, to, neg_threshold, pos_threshold](RunContext ctx) { std::vector inputs(3); inputs[0] = from.data(); inputs[1] = residual->data(); inputs[2] = to->data(); - for(int i=0; i<4; i++) { - CHECK_EQ(*(from.data().dptr()+i), 1.); - } - std::cout<<"passed checks"<(ctx.get_stream(), inputs, neg_threshold, pos_threshold); }, from.ctx(), const_vars, mutable_vars, diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index 2f0f6c998b33..3e74e05af837 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -123,17 +123,15 @@ def check_ones(kv, pos, rate, curval): for o in out: check_diff_to_scalar(o, newval) - pull_before_push(kv) - push_zeros(kv) - curval = verify_residual(kv, neg, pos, rate) - check_ones(kv, pos, rate, curval) + pull_before_push(kv) + push_zeros(kv) + curval = verify_residual(kv, neg, pos, rate) + check_ones(kv, pos, rate, curval) test_kvstore('local_update_cpu') test_kvstore('local_allreduce_cpu') test_kvstore('local_allreduce_device') -test_compress_kvstore('local_update_cpu') 
-test_compress_kvstore('local_allreduce_cpu') test_compress_kvstore('local_allreduce_device') ## group keys interface From c8cfae53ce37963290d3c3c7f63b8c8798659573 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 11 Oct 2017 15:56:59 -0700 Subject: [PATCH 102/237] tests print more info order of params corrected --- src/operator/contrib/two_bit_quantize-inl.h | 8 +++---- tests/python/gpu/test_operator_gpu.py | 25 ++++++++++++++------- tests/python/unittest/test_operator.py | 25 ++++++++++++++------- 3 files changed, 38 insertions(+), 20 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index de99b6d42795..161be4a13507 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -44,14 +44,14 @@ struct init_mem_2bit { struct TwoBitParam : public dmlc::Parameter { float pos_threshold, neg_threshold; DMLC_DECLARE_PARAMETER(TwoBitParam) { - DMLC_DECLARE_FIELD(pos_threshold) - .set_default(0.1) - .describe("Threshold to quantize positive values. " - "Has to be greater than 0"); DMLC_DECLARE_FIELD(neg_threshold) .set_default(-0.1) .describe("Threshold to quantize negative values. " "Has to be less than 0"); + DMLC_DECLARE_FIELD(pos_threshold) + .set_default(0.1) + .describe("Threshold to quantize positive values. 
" + "Has to be greater than 0"); } }; diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 86ddf905ee35..048d3493f0de 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -1455,26 +1455,35 @@ def test_two_bit_quantization_op(): mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) decompr = mx.nd.zeros(grad.shape) mx.contrib.ndarray.dequantize_2bit(compr, decompr) - assert same(np.zeros(grad.shape), decompr.asnumpy()) - assert same(residual.asnumpy(), np.array([1.0, 1.0, 1.0])) + exp_residual = np.ones(grad.shape) + exp_grad = np.zeros(grad.shape) + assert same(np.zeros(grad.shape), decompr.asnumpy()), (decompr.asnumpy(), exp_grad) + assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) grad = mx.nd.array([3.0, 3.0, 3.0], ctx=mx.gpu(0)) mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) mx.contrib.ndarray.dequantize_2bit(compr, decompr) - assert same(np.ones(grad.shape)*(pos_threshold.asnumpy()), decompr.asnumpy()) - assert same(residual.asnumpy(), np.array([0.0, 0.0, 0.0])) + exp_grad = np.ones(grad.shape)*pos_threshold + exp_residual = np.zeros(grad.shape) + assert same(exp_grad, decompr.asnumpy()), (decompr.asnumpy(),exp_grad) + assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) grad = mx.nd.array([1.0, 1.0, 1.0], ctx=mx.gpu(0)) mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) mx.contrib.ndarray.dequantize_2bit(compr, decompr) - assert same(np.zeros(grad.shape), decompr.asnumpy()) - assert same(residual.asnumpy(), np.array([1.0, 1.0, 1.0])) + exp_grad = np.zeros(grad.shape) + exp_residual = np.ones(grad.shape) + assert same(exp_grad, decompr.asnumpy()), (decompr.asnumpy(), exp_grad) + assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) grad = mx.nd.array([6.0, 6.0, 6.0], ctx=mx.gpu(0)) 
mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) mx.contrib.ndarray.dequantize_2bit(compr, decompr) - assert same(np.ones(grad.shape)*(pos_threshold.asnumpy()), decompr.asnumpy()) - assert same(residual.asnumpy(), np.array([3.0, 3.0, 3.0])) + exp_grad = np.ones(grad.shape)*pos_threshold + exp_residual = np.ones(grad.shape)*3 + assert same(exp_grad, decompr.asnumpy()), (decompr.asnumpy(), exp_grad) + assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) + if __name__ == '__main__': import nose diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 298ec25f4370..127bb92cff3e 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3503,26 +3503,35 @@ def test_two_bit_quantization_op(): mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) decompr = mx.nd.zeros(grad.shape) mx.contrib.ndarray.dequantize_2bit(compr, decompr) - assert same(np.zeros(grad.shape), decompr.asnumpy()) - assert same(residual.asnumpy(), np.array([1.0, 1.0, 1.0])) + exp_residual = np.ones(grad.shape) + exp_grad = np.zeros(grad.shape) + assert same(np.zeros(grad.shape), decompr.asnumpy()), (decompr.asnumpy(), exp_grad) + assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) grad = mx.nd.array([3.0, 3.0, 3.0]) mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) mx.contrib.ndarray.dequantize_2bit(compr, decompr) - assert same(np.ones(grad.shape)*(pos_threshold.asnumpy()), decompr.asnumpy()) - assert same(residual.asnumpy(), np.array([0.0, 0.0, 0.0])) + exp_grad = np.ones(grad.shape)*pos_threshold + exp_residual = np.zeros(grad.shape) + assert same(exp_grad, decompr.asnumpy()), (decompr.asnumpy(),exp_grad) + assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) grad = mx.nd.array([1.0, 1.0, 1.0]) 
mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) mx.contrib.ndarray.dequantize_2bit(compr, decompr) - assert same(np.zeros(grad.shape), decompr.asnumpy()) - assert same(residual.asnumpy(), np.array([1.0, 1.0, 1.0])) + exp_grad = np.zeros(grad.shape) + exp_residual = np.ones(grad.shape) + assert same(exp_grad, decompr.asnumpy()), (decompr.asnumpy(), exp_grad) + assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) grad = mx.nd.array([6.0, 6.0, 6.0]) mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) mx.contrib.ndarray.dequantize_2bit(compr, decompr) - assert same(np.ones(grad.shape)*(pos_threshold.asnumpy()), decompr.asnumpy()) - assert same(residual.asnumpy(), np.array([3.0, 3.0, 3.0])) + exp_grad = np.ones(grad.shape)*pos_threshold + exp_residual = np.ones(grad.shape)*3 + assert same(exp_grad, decompr.asnumpy()), (decompr.asnumpy(), exp_grad) + assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) + def test_reciprocal_op(): data_tmp = np.random.rand(3, 4) * 10 - 5 From e2b405ab0fc50fbbc1420ccbccb27d392ac10f06 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 12 Oct 2017 00:23:31 +0000 Subject: [PATCH 103/237] assert almost equal --- tests/nightly/test_kvstore.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index 3e74e05af837..a8da266d34d8 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -21,6 +21,7 @@ sys.path.insert(0, "../../python/") import mxnet as mx import numpy as np +from mxnet.test_utils import assert_almost_equal def check_diff_to_scalar(A, x, rank=None): """ assert A == x""" @@ -32,7 +33,7 @@ def check_diff_to_scalar(A, x, rank=None): lr = .1 nworker = 4 -nrepeat = 1 +nrepeat = 10 ## generate data data = [[[np.random.random(s)*2-1 for i in range(nworker)] for s in shapes] for 
j in range(nrepeat)] @@ -64,7 +65,7 @@ def test_compress_kvstore(kv_type, compress='2bit', neg=-0.5, pos=0.5): rate = 2 kv = mx.kv.create(kv_type) kv.set_compress({'compress':compress, 'neg_threshold':neg, 'pos_threshold':pos}) - kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate)) +# kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate)) for k, s in zip(keys, shapes): kv.init(k, mx.nd.zeros(s)) @@ -73,8 +74,9 @@ def pull_before_push(kv): for j in range(len(keys)): out = [mx.nd.ones(shapes[j], mx.gpu(g)) for g in range(nworker)] kv.pull(keys[j], out=out) + exp = np.zeros_like(out[0].asnumpy()) for o in out: - check_diff_to_scalar(o, 0) + assert_almost_equal(o.asnumpy(), exp) def push_zeros(kv): for i in range(nrepeat): @@ -82,8 +84,9 @@ def push_zeros(kv): kv.push(keys[j], [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)]) out = [mx.nd.ones(shapes[j], mx.gpu(g)) for g in range(nworker)] kv.pull(keys[j], out=out) + exp = np.zeros_like(out[0].asnumpy()) for o in out: - check_diff_to_scalar(o, 0) + assert_almost_equal(o.asnumpy(), exp) def verify_residual(kv, neg_threshold, pos_threshold, rate): for j in range(len(keys)): @@ -93,7 +96,7 @@ def verify_residual(kv, neg_threshold, pos_threshold, rate): for o in out: check_diff_to_scalar(o, 0) - kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*(pos_threshold-0.4) for g in range(nworker)]) + kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*(pos_threshold-0.3) for g in range(nworker)]) out = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] kv.pull(keys[j],out=out) curval = pos_threshold * rate * nworker @@ -125,12 +128,12 @@ def check_ones(kv, pos, rate, curval): pull_before_push(kv) push_zeros(kv) - curval = verify_residual(kv, neg, pos, rate) - check_ones(kv, pos, rate, curval) + #curval = verify_residual(kv, neg, pos, rate) + #check_ones(kv, pos, rate, curval) -test_kvstore('local_update_cpu') -test_kvstore('local_allreduce_cpu') -test_kvstore('local_allreduce_device') 
+#test_kvstore('local_update_cpu') +#test_kvstore('local_allreduce_cpu') +#test_kvstore('local_allreduce_device') test_compress_kvstore('local_allreduce_device') @@ -154,6 +157,6 @@ def test_group_kvstore(kv_type): err = sum(err) / np.sum(np.abs(a)) assert(err < 1e-6), (err, a.shape) -test_group_kvstore('local_update_cpu') -test_group_kvstore('local_allreduce_cpu') -test_group_kvstore('local_allreduce_device') +#test_group_kvstore('local_update_cpu') +#test_group_kvstore('local_allreduce_cpu') +#test_group_kvstore('local_allreduce_device') From 3ee92496d54a53a9ab2b5af2bfefcad8e9767220 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 12 Oct 2017 17:56:11 -0700 Subject: [PATCH 104/237] more debug stuff correct profiler message --- src/kvstore/comm.h | 50 ++++++++++++++++++++++++++++++++++--- src/kvstore/kvstore_local.h | 14 ++++++++++- src/ndarray/ndarray.cc | 8 +++--- 3 files changed, 64 insertions(+), 8 deletions(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index e71ebed7bc08..0e5548d886cf 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -495,9 +495,9 @@ class CommDevice : public Comm { int priority) override { // avoid extra copy for single device, but it may bring problems for // abnormal usage of kvstore - if (src.size() == 1) { - return src[0]; - } +// if (src.size() == 1) { +// return src[0]; +// } if (!inited_) { std::vector devs; @@ -527,6 +527,7 @@ class CommDevice : public Comm { buf.residual.resize(src.size()); } + std::cout<<"merged ctx is "<())<())<())<<" "<<*(sendbufitemp.data().dptr()+1)<< + " "<<*(sendbufitemp.data().dptr()+2)<< " "<< *(sendbufitemp.data().dptr()+3)<()) + <<" "<<*(recvbufitemp.data().dptr()+1) + <<" "<<*(recvbufitemp.data().dptr()+2)<< " "<< *(recvbufitemp.data().dptr()+3)<()<< " for i="<())<())<())<& keys, @@ -203,6 +211,10 @@ class KVStoreLocal : public KVStore { for (size_t i = 0; i < uniq_keys.size(); ++i) { int key = uniq_keys[i]; const NDArray& local = local_[key]; + NDArray localtemp = 
NDArray(local.shape(), pinned_ctx_, false, local.dtype()); + CopyFromTo(local, localtemp, 0); + localtemp.WaitToRead(); + std::cout<<"pull local "<<*(localtemp.data().dptr())<Broadcast(key, local, grouped_vals[i], priority); } diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index a4a4e2ef5351..65dda57d76f8 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -583,9 +583,9 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, neg_threshold, pos_threshold); }, from.ctx(), const_vars, mutable_vars, - FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); + FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); } else { - LOG(FATAL) << "Unsupported dequantization"; + LOG(FATAL) << "Unsupported Quantization"; } } else { #if MXNET_USE_CUDA @@ -599,9 +599,9 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, neg_threshold, pos_threshold); }, from.ctx(), const_vars, mutable_vars, - FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); + FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); } else { - LOG(FATAL) << "Unsupported dequantization"; + LOG(FATAL) << "Unsupported Quantization"; } } else { LOG(FATAL) << "unknown device mask"; From 15e4f9cf48cbad44a0b5c7d427de5998f0fd3fba Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 13 Oct 2017 00:29:26 -0700 Subject: [PATCH 105/237] intermediate test rewrite --- tests/python/gpu/test_operator_gpu.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 048d3493f0de..eda59e00fb03 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -19,6 +19,7 @@ import os import time import unittest +import struct import mxnet as mx import numpy 
as np from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal @@ -1445,14 +1446,26 @@ def test_cross_device_autograd(): assert_almost_equal(dx, x.grad.asnumpy()) -def test_two_bit_quantization_op(): - neg_threshold = -4.0 - pos_threshold = 4.0 +def test_two_bit_quantization(): + def binary(num): + return ''.join(bin(ord(c)).replace('0b', '').rjust(8, '0') for c in struct.pack('!f', num)) - grad = mx.nd.array([1.0, 1.0, 1.0], ctx=mx.gpu(0)) - residual = mx.nd.array([0.0, 0.0, 0.0], ctx=mx.gpu(0)) + neg_threshold = -0.5 + pos_threshold = 0.5 + + orig_shape = (16) + + # push all 0s + grad = mx.nd.zeros(orig_shape, ctx=mx.gpu(0)) + residual = mx.nd.zeros(grad.shape, ctx=mx.gpu(0)) compr = mx.contrib.nd.create_2bit(grad) mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) + f1 = binary(compr.asnumpy()[3]) + + + + + decompr = mx.nd.zeros(grad.shape) mx.contrib.ndarray.dequantize_2bit(compr, decompr) exp_residual = np.ones(grad.shape) From 39f3bacfe6834c3d0787cd0ae1a4c4213c298ed4 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 13 Oct 2017 15:12:15 -0700 Subject: [PATCH 106/237] small change in pushing op to engineh --- src/ndarray/ndarray.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 65dda57d76f8..a5a0c0bc107d 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -575,11 +575,11 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, mutable_vars.push_back(res.var()); if (a == cpu::kDevMask && b == cpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([from, residual, to, neg_threshold, pos_threshold](RunContext ctx) { + Engine::Get()->PushSync([from, res, ret, neg_threshold, pos_threshold](RunContext ctx) { std::vector inputs(3); inputs[0] = from.data(); - inputs[1] = residual->data(); - inputs[2] = to->data(); + inputs[1] = res.data(); + inputs[2] = 
ret.data(); mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, neg_threshold, pos_threshold); }, from.ctx(), const_vars, mutable_vars, @@ -591,11 +591,11 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, #if MXNET_USE_CUDA if (a == gpu::kDevMask && b == gpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([from, residual, to, neg_threshold, pos_threshold](RunContext ctx) { + Engine::Get()->PushSync([from, res, ret, neg_threshold, pos_threshold](RunContext ctx) { std::vector inputs(3); inputs[0] = from.data(); - inputs[1] = residual->data(); - inputs[2] = to->data(); + inputs[1] = res.data(); + inputs[2] = ret.data(); mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, neg_threshold, pos_threshold); }, from.ctx(), const_vars, mutable_vars, @@ -621,10 +621,10 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri int b = to->ctx().dev_mask(); if (a == cpu::kDevMask && b == cpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([from, to](RunContext ctx) { + Engine::Get()->PushSync([from, ret](RunContext ctx) { std::vector inputs(2); inputs[0] = from.data(); - inputs[1] = to->data(); + inputs[1] = ret.data(); mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); }, from.ctx(), {from.var()}, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); @@ -635,10 +635,10 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri #if MXNET_USE_CUDA if (a == gpu::kDevMask && b == gpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([from, to](RunContext ctx) { + Engine::Get()->PushSync([from, ret](RunContext ctx) { std::vector inputs(2); inputs[0] = from.data(); - inputs[1] = to->data(); + inputs[1] = ret.data(); mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); }, from.ctx(), {from.var()}, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); From 
50fa0fa98d74efa7046ef2b498ff63fdc9c7a267 Mon Sep 17 00:00:00 2001 From: Rahul Date: Mon, 16 Oct 2017 16:12:26 -0700 Subject: [PATCH 107/237] fix concurrency of quantization --- src/operator/contrib/two_bit_quantize-inl.h | 79 ++++++--------------- tests/python/gpu/test_operator_gpu.py | 51 ------------- tests/python/unittest/test_operator.py | 1 + 3 files changed, 24 insertions(+), 107 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 161be4a13507..d7cc2d87ccbb 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -37,7 +37,7 @@ namespace op { struct init_mem_2bit { // Initialize output array MSHADOW_XINLINE static void Map(int i, float* out) { - *reinterpret_cast(out+i) = 0x00000000; + *(out+i) = 0; } }; @@ -114,68 +114,34 @@ struct init_threshold_2bit { }; struct quantize_2bit { - MSHADOW_XINLINE static void Map(int i, + MSHADOW_XINLINE static void Map(int block_id, + int gradsize, float *out, float *grad, float *residual, const float neg_threshold, const float pos_threshold) { - // Add residual to gradient - grad[i] += residual[i]; - // Considers each float in the output array as forming a block - // Each block comprises a 4x4 grid. Each value in this grid - // refers to one float in the original grad array - // Only supports float32 + float* out_block = out + block_id; + // start and end are indices in original grad array + int start = block_id*16; + int end = ( start + 16 <= gradsize) ? 
start+16 : gradsize; - // get block id - int block_id = i / 16; - char* ch_ptr = reinterpret_cast(out+block_id); - // get row ptr - int row_id = (i%16)/4; - ch_ptr += row_id; - // get column id - int col_id = (i%16)%4; - // Compress - if (grad[i] <= neg_threshold) { // set data to 01 - // new residual - residual[i] = grad[i] - neg_threshold; - switch (col_id) { - case 0: - (*ch_ptr) |= 0x40; // binary: (01)00 0000 - break; - case 1: - (*ch_ptr) |= 0x10; // binary: 00(01) 0000 - break; - case 2: - (*ch_ptr) |= 0x04; // binary: 0000 (01)00 - break; - case 3: - (*ch_ptr) |= 0x01; // binary: 0000 00(01) - break; - default: - break; - } - } else if (grad[i] >= pos_threshold) { // set data to 10 - residual[i] = grad[i] - pos_threshold; - switch (col_id) { - case 0: - (*ch_ptr) |= 0x80; // binary: (10)00 0000 - break; - case 1: - (*ch_ptr) |= 0x20; // binary: 00(10) 0000 - break; - case 2: - (*ch_ptr) |= 0x08; // binary: 0000 (10)00 - break; - case 3: - (*ch_ptr) |= 0x02; // binary: 0000 00(10) - break; - default: - break; + for (int i=start; i(out_block + i%4); + if (grad[i] >= pos_threshold) { + residual[i] = grad[i] - pos_threshold; + // set data to 10 + (*ch_ptr) |= (2u<<(6-((i%4)*2))); + } else if (grad[i] <= neg_threshold) { + residual[i] = grad[i] - neg_threshold; + // set data to 01 + (*ch_ptr) |= (1u<<(6-((i%4)*2))); + } else { + // leave data as 00 + residual[i] = grad[i]; } - } else { // else 00 - residual[i] = grad[i]; } } }; @@ -192,7 +158,8 @@ void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, neg_threshold, pos_threshold, inputs[0].Size()); // Finally, compress the data and calculate new residual - mxnet_op::Kernel::Launch(s, inputs[0].Size(), + mxnet_op::Kernel::Launch(s, inputs[2].Size()-3, + inputs[0].Size(), inputs[2].dptr()+3, // compressed array inputs[0].dptr(), // input array inputs[1].dptr(), // residual array diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 
eda59e00fb03..fcf3aa83b2c4 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -1446,57 +1446,6 @@ def test_cross_device_autograd(): assert_almost_equal(dx, x.grad.asnumpy()) -def test_two_bit_quantization(): - def binary(num): - return ''.join(bin(ord(c)).replace('0b', '').rjust(8, '0') for c in struct.pack('!f', num)) - - neg_threshold = -0.5 - pos_threshold = 0.5 - - orig_shape = (16) - - # push all 0s - grad = mx.nd.zeros(orig_shape, ctx=mx.gpu(0)) - residual = mx.nd.zeros(grad.shape, ctx=mx.gpu(0)) - compr = mx.contrib.nd.create_2bit(grad) - mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) - f1 = binary(compr.asnumpy()[3]) - - - - - - decompr = mx.nd.zeros(grad.shape) - mx.contrib.ndarray.dequantize_2bit(compr, decompr) - exp_residual = np.ones(grad.shape) - exp_grad = np.zeros(grad.shape) - assert same(np.zeros(grad.shape), decompr.asnumpy()), (decompr.asnumpy(), exp_grad) - assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) - - grad = mx.nd.array([3.0, 3.0, 3.0], ctx=mx.gpu(0)) - mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) - mx.contrib.ndarray.dequantize_2bit(compr, decompr) - exp_grad = np.ones(grad.shape)*pos_threshold - exp_residual = np.zeros(grad.shape) - assert same(exp_grad, decompr.asnumpy()), (decompr.asnumpy(),exp_grad) - assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) - - grad = mx.nd.array([1.0, 1.0, 1.0], ctx=mx.gpu(0)) - mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) - mx.contrib.ndarray.dequantize_2bit(compr, decompr) - exp_grad = np.zeros(grad.shape) - exp_residual = np.ones(grad.shape) - assert same(exp_grad, decompr.asnumpy()), (decompr.asnumpy(), exp_grad) - assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) - - grad = mx.nd.array([6.0, 6.0, 6.0], ctx=mx.gpu(0)) - 
mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) - mx.contrib.ndarray.dequantize_2bit(compr, decompr) - exp_grad = np.ones(grad.shape)*pos_threshold - exp_residual = np.ones(grad.shape)*3 - assert same(exp_grad, decompr.asnumpy()), (decompr.asnumpy(), exp_grad) - assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) - if __name__ == '__main__': import nose diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 127bb92cff3e..f293fe3170e2 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -4306,6 +4306,7 @@ def check(data, idx): assert (mx.nd.scatter_nd(data, idx, shape=(2, 2)).asnumpy() == [[0, 0], [2, 3]]).all() + if __name__ == '__main__': import nose nose.runmodule() From 6bb14119ad75fc0edf086ddd564d242fa624febd Mon Sep 17 00:00:00 2001 From: Rahul Date: Mon, 16 Oct 2017 18:09:28 -0700 Subject: [PATCH 108/237] wait on kernel --- src/ndarray/ndarray.cc | 22 +++++++++++++-------- src/operator/contrib/two_bit_quantize-inl.h | 2 +- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index a5a0c0bc107d..e37d7480b69a 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -598,6 +598,8 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, inputs[2] = ret.data(); mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, neg_threshold, pos_threshold); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); }, from.ctx(), const_vars, mutable_vars, FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); } else { @@ -626,8 +628,10 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri inputs[0] = from.data(); inputs[1] = ret.data(); mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); - }, from.ctx(), {from.var()}, {ret.var()}, - FnProperty::kNormal, priority, 
PROFILER_MESSAGE("DequantizeCPU")); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {ret.var()}, + FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { LOG(FATAL) << "Unsupported dequantization " << compress << std::endl; } @@ -639,12 +643,14 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri std::vector inputs(2); inputs[0] = from.data(); inputs[1] = ret.data(); - mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); - }, from.ctx(), {from.var()}, {ret.var()}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); - } else { - LOG(FATAL) << "Unsupported dequantization "<(ctx.get_stream(), inputs); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {ret.var()}, + FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); + } else { + LOG(FATAL) << "Unsupported dequantization "<(out+i) = 0x00000000; + *(out+i) = 0x00000000; } }; From 69f9e1171b617e0c0d0cf82eb0fa2a6511d87b47 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 17 Oct 2017 14:46:36 -0700 Subject: [PATCH 109/237] updated tests and removed prints --- src/kvstore/comm.h | 43 -------- src/kvstore/kvstore_dist.h | 34 +++--- src/kvstore/kvstore_dist_server.h | 21 +++- src/kvstore/kvstore_local.h | 14 +-- src/ndarray/ndarray.cc | 2 - src/ndarray/ndarray_function.cc | 1 + src/operator/contrib/two_bit_quantize-inl.h | 42 ++++++- tests/nightly/dist_sync_kvstore.py | 70 ++++++++---- tests/nightly/test_kvstore.py | 20 ++-- tests/python/unittest/test_operator.py | 115 +++++++++++++++++++- 10 files changed, 250 insertions(+), 112 deletions(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 0e5548d886cf..6ac8c121598c 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -555,59 +555,16 @@ class CommDevice : public Comm { if (compress_=="none") { CopyFromTo(src[i], &(buf.copy_buf[i]), priority); } else if (compress_ == 
"2bit") { - - NDArray resbufitemp = NDArray(buf.residual[i].shape(), pinned_ctx(), false, buf.residual[i].dtype()); - CopyFromTo(buf.residual[i], resbufitemp, 0); - resbufitemp.WaitToRead(); - std::cout<())<())<())<<" "<<*(sendbufitemp.data().dptr()+1)<< - " "<<*(sendbufitemp.data().dptr()+2)<< " "<< *(sendbufitemp.data().dptr()+3)<()) - <<" "<<*(recvbufitemp.data().dptr()+1) - <<" "<<*(recvbufitemp.data().dptr()+2)<< " "<< *(recvbufitemp.data().dptr()+3)<()<< " for i="<())<())<>= 1; - } - - // Reverse the string since now it's backwards - std::string temp(str.rbegin(), str.rend()); - str = temp; - } +// void floatToBinary(float f, std::string& str) +// { +// union { float f; uint32_t i; } u; +// u.f = f; +// str.clear(); +// +// for (int i = 0; i < 32; i++) +// { +// if (u.i % 2) str.push_back('1'); +// else str.push_back('0'); +// u.i >>= 1; +// } +// +// // Reverse the string since now it's backwards +// std::string temp(str.rbegin(), str.rend()); +// str = temp; +// } class KVStoreDist : public KVStoreLocal { diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 242005696616..6f6f247e0b13 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -410,14 +410,16 @@ class KVStoreDistServer { decomp_buf = NDArray(dshape, Context()); } } -// if(compress_!="none") { + if(compress_!="none") { // CHECK_EQ(*((float *) recved.data().dptr_),-0.5); // CHECK_EQ(*((float *) recved.data().dptr_+1),0.5); // CHECK_EQ(*((float *) recved.data().dptr_+2),dshape.Size()); -// for(int i=3; i())<& keys, @@ -211,10 +203,6 @@ class KVStoreLocal : public KVStore { for (size_t i = 0; i < uniq_keys.size(); ++i) { int key = uniq_keys[i]; const NDArray& local = local_[key]; - NDArray localtemp = NDArray(local.shape(), pinned_ctx_, false, local.dtype()); - CopyFromTo(local, localtemp, 0); - localtemp.WaitToRead(); - std::cout<<"pull local "<<*(localtemp.data().dptr())<Broadcast(key, local, grouped_vals[i], priority); } diff 
--git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index e37d7480b69a..3703e713fb41 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -628,8 +628,6 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri inputs[0] = from.data(); inputs[1] = ret.data(); mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); }, from.ctx(), {from.var()}, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index 4ad7a3415ddc..057fd7796e15 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -193,6 +193,7 @@ template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { mxnet::op::Quantize2BitImpl(s,inputs, neg_threshold, pos_threshold); + } } // namespace ndarray diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index d7cc2d87ccbb..d86e21d73376 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -41,6 +41,7 @@ struct init_mem_2bit { } }; + struct TwoBitParam : public dmlc::Parameter { float pos_threshold, neg_threshold; DMLC_DECLARE_PARAMETER(TwoBitParam) { @@ -122,27 +123,60 @@ struct quantize_2bit { const float neg_threshold, const float pos_threshold) { +// int num = 1; +// if(*(char *)&num == 1) +// { +// std::cout<<"Little-Endian"<(out_block); for (int i=start; i(out_block + i%4); + char* curr_ptr = ch_ptr + (i-start)/4; if (grad[i] >= pos_threshold) { residual[i] = grad[i] - pos_threshold; // set data to 10 - (*ch_ptr) |= (2u<<(6-((i%4)*2))); +// std::cout<<"or "<<(2u<<(6-((i%4)*2)))<>= 1; } + + // Reverse the string since now it's backwards + std::string temp(fstr.rbegin(), fstr.rend()); + fstr = temp; + +// 
floatToBinary3(*out_block, fstr); +// std::cout<=pos: + str_quant += '10' + new_residual.append(a - pos) + decompr.append(pos) + elif a<= neg: + str_quant += '01' + new_residual.append(a - neg) + decompr.append(neg) + else: + str_quant += '00' + new_residual.append(a) + decompr.append(0) + # append extra bits when size of array not a factor of 16 + if len(str_quant)%16 != 0: + str_quant += '0'*(16 - len(str_quant)%16) + + compr = [neg, pos, len(arr)] + # converts the string generated into integers 32chars at a time + i = 0 + while i Date: Tue, 17 Oct 2017 16:27:31 -0700 Subject: [PATCH 110/237] comment unnecessary stuff --- src/operator/contrib/two_bit_quantize-inl.h | 30 ++++++++++----------- tests/nightly/dist_sync_kvstore.py | 2 -- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index d86e21d73376..03bef5c28d04 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -159,21 +159,21 @@ struct quantize_2bit { } // std::cout<<*out_block<>= 1; - } - - // Reverse the string since now it's backwards - std::string temp(fstr.rbegin(), fstr.rend()); - fstr = temp; +// std::string fstr; +// union { float f; uint32_t i; } u; +// u.f = *out_block; +// fstr.clear(); +// +// for (int i = 0; i < 32; i++) +// { +// if (u.i % 2) fstr.push_back('1'); +// else fstr.push_back('0'); +// u.i >>= 1; +// } +// +// // Reverse the string since now it's backwards +// std::string temp(fstr.rbegin(), fstr.rend()); +// fstr = temp; // floatToBinary3(*out_block, fstr); // std::cout< Date: Wed, 18 Oct 2017 00:23:37 +0000 Subject: [PATCH 111/237] fix test --- tests/nightly/test_kvstore.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index e6962077eae4..05156b783a61 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -115,9 
+115,7 @@ def verify_residual(kv, neg_threshold, pos_threshold, rate): curval += pos_threshold*rate*nworker for o in out: check_diff_to_scalar(o, curval) - return curval - - + return curval def check_ones(kv, pos, rate, curval): newval = curval + rate*nworker*pos From 71296f808d8206d2168ba8fdc3ac36aee69564b6 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 18 Oct 2017 00:24:31 +0000 Subject: [PATCH 112/237] remove print --- src/kvstore/comm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 6ac8c121598c..ddaf5b4316c8 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -527,7 +527,6 @@ class CommDevice : public Comm { buf.residual.resize(src.size()); } - std::cout<<"merged ctx is "< Date: Tue, 17 Oct 2017 17:52:45 -0700 Subject: [PATCH 113/237] Update dist_sync_kvstore.py fix random dist sync test --- tests/nightly/dist_sync_kvstore.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 19f84edcb790..48ef077031f3 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -239,14 +239,16 @@ def check_compr_random(kv, pos, neg, nworker): # v = mx.nd.zeros(d[1][0]*d[1][1]) grad = mx.nd.array(rnd.rand(d[1][0], d[1][1])) + grad_cpy = mx.nd.array(grad) kv.push(d[0], grad) val = mx.nd.zeros(d[1]) kv.pull(d[0], val) expected_diff = val - orig_val - - compr = mx.contrib.nd.create_2bit(grad) - mx.contrib.ndarray.quantize_2bit(grad, mx.nd.zeros(d[1]), compr, neg, pos) + + # use copy because push modifies grad + compr = mx.contrib.nd.create_2bit(grad_cpy) + mx.contrib.ndarray.quantize_2bit(grad_cpy, mx.nd.zeros(d[1]), compr, neg, pos) decompr = mx.nd.zeros(grad.shape) mx.contrib.ndarray.dequantize_2bit(compr, decompr) From 3234aa49eb643f1e11a91fbb703c3248ea8c1d27 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 17 Oct 2017 19:39:20 -0700 Subject: [PATCH 114/237] remove slow kernel 
launch init --- src/operator/contrib/two_bit_quantize-inl.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 03bef5c28d04..28e248a9bf73 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -134,6 +134,10 @@ struct quantize_2bit { // } float* out_block = out + block_id; + + // init to 0 + *out_block = 0; + // start and end are indices in original grad array int start = block_id*16; int end = ( start + 16 <= gradsize) ? start+16 : gradsize; @@ -184,8 +188,8 @@ template void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { // First, init the memory of output to 0x00000000 - mxnet_op::Kernel::Launch(s, inputs[2].Size(), - inputs[2].dptr()); // compressed array +// mxnet_op::Kernel::Launch(s, inputs[2].Size(), +// inputs[2].dptr()); // compressed array // Then, init threshold and original size mxnet_op::Kernel::Launch(s, 1, inputs[2].dptr(), // compressed array From 72d28b63e37f22a769caa5f205e3c208c5addf95 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 18 Oct 2017 05:26:53 +0000 Subject: [PATCH 115/237] cleanup --- src/operator/contrib/two_bit_quantize-inl.h | 41 +-------- tests/nightly/dist_sync_kvstore.py | 97 +++++++++++---------- tests/python/unittest/test_operator.py | 12 +-- 3 files changed, 57 insertions(+), 93 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 28e248a9bf73..e2dd507cb1e3 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -122,22 +122,9 @@ struct quantize_2bit { float *residual, const float neg_threshold, const float pos_threshold) { - -// int num = 1; -// if(*(char *)&num == 1) -// { -// std::cout<<"Little-Endian"<= pos_threshold) { residual[i] = grad[i] - 
pos_threshold; // set data to 10 -// std::cout<<"or "<<(2u<<(6-((i%4)*2)))<>= 1; -// } -// -// // Reverse the string since now it's backwards -// std::string temp(fstr.rbegin(), fstr.rend()); -// fstr = temp; - -// floatToBinary3(*out_block, fstr); -// std::cout< void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - // First, init the memory of output to 0x00000000 -// mxnet_op::Kernel::Launch(s, inputs[2].Size(), -// inputs[2].dptr()); // compressed array - // Then, init threshold and original size + // Init threshold and original size mxnet_op::Kernel::Launch(s, 1, inputs[2].dptr(), // compressed array neg_threshold, pos_threshold, diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 48ef077031f3..c7443d143856 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -178,98 +178,101 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): check_diff_to_scalar(val, expected, rank=my_rank) def check_compr_residual(kv, pos_threshold, nworker): - for d in [('21', shape),('2221',irregular_shape),('221', big_shape), ('21', shape)]: + for k,s in [('21', shape),('2221',irregular_shape),('221', big_shape)]: # doesn't meet threshold - kv.push(d[0], mx.nd.ones(d[1])*0.4) - val=mx.nd.zeros(d[1]) - kv.pull(d[0],val) + kv.push(k, mx.nd.ones(s)*0.4) + val=mx.nd.zeros(s) + kv.pull(k,val) check_diff_to_scalar(val, 0) # just meets threshold with residual - kv.push(d[0], mx.nd.ones(d[1])*(pos_threshold - 0.4)) - val2 = mx.nd.zeros(d[1]) - kv.pull(d[0],val2) + kv.push(k, mx.nd.ones(s)*(pos_threshold - 0.4)) + val2 = mx.nd.zeros(s) + kv.pull(k,val2) curval = pos_threshold * rate * nworker check_diff_to_scalar(val2, curval) # doesn't meet threshold - kv.push(d[0], mx.nd.ones(d[1])*0.2) - val3= mx.nd.zeros(d[1]) - kv.pull(d[0], val3) + kv.push(k, mx.nd.ones(s)*0.2) + val3= mx.nd.zeros(s) + kv.pull(k, val3) check_diff_to_scalar(val3, curval) # 
exceeds again - kv.push(d[0], mx.nd.ones(d[1])*(pos_threshold-0.2)) - val4 = mx.nd.zeros(d[1]) - kv.pull(d[0],val4) + kv.push(k, mx.nd.ones(s)*(pos_threshold-0.2)) + val4 = mx.nd.zeros(s) + kv.pull(k,val4) curval += pos_threshold*rate*nworker check_diff_to_scalar(val4, curval) # residual is 0 now def check_compr_ones(kv, pos, nworker): - val = mx.nd.zeros(big_shape) - kv.pull('221', val) - curval = val[0][0].asnumpy()[0] - kv.push('221',mx.nd.ones(big_shape)*pos) - val2 = mx.nd.zeros(big_shape) - kv.pull('221', val2) - newval = curval + rate*nworker*pos - check_diff_to_scalar(val2, newval) - # residual = 0 again + for k,s in [('21', shape),('2221',irregular_shape),('221', big_shape)]: + val = mx.nd.zeros(s) + kv.pull(k, val) + curval = val[0][0].asnumpy()[0] + kv.push(k,mx.nd.ones(s)*pos) + val2 = mx.nd.zeros(s) + kv.pull(k, val2) + newval = curval + rate*nworker*pos + check_diff_to_scalar(val2, newval) + # residual = 0 again def check_compr_pull_before_push(kv): - val = mx.nd.ones(irregular_shape) - kv.pull('2221', val) - check_diff_to_scalar(val, 0) + for k,s in [('21', shape),('2221',irregular_shape),('221', big_shape)]: + print(k,s) + val = mx.nd.ones(s) + kv.pull(k, val) + check_diff_to_scalar(val, 0) def check_compr_zero(kv): - kv.push('221', mx.nd.zeros(big_shape)) - # to check that all are set to 0s - val = mx.nd.ones(big_shape) - kv.pull('221', val) - check_diff_to_scalar(val, 0) + for k,s in [('21', shape),('2221',irregular_shape),('221', big_shape)]: + print(k,s) + kv.push(k, mx.nd.zeros(s)) + # to check that all are set to 0s + val = mx.nd.ones(s) + kv.pull(k, val) + check_diff_to_scalar(val, 0) def check_compr_random(kv, pos, neg, nworker): mx.random.seed(123) rnd.seed(123) nrepeat = 3 - for d in [('2221',irregular_shape),('221', big_shape), ('21', shape)]: - orig_val = mx.nd.zeros(d[1]) - kv.pull(d[0], orig_val) + for k,s in [('2221',irregular_shape),('221', big_shape), ('21', shape)]: + orig_val = mx.nd.zeros(s) + kv.pull(k, orig_val) - # v = 
mx.nd.zeros(d[1][0]*d[1][1]) - grad = mx.nd.array(rnd.rand(d[1][0], d[1][1])) + grad = mx.nd.array(rnd.rand(s[0], s[1])) grad_cpy = mx.nd.array(grad) - kv.push(d[0], grad) - val = mx.nd.zeros(d[1]) - kv.pull(d[0], val) + kv.push(k, grad) + val = mx.nd.zeros(s) + kv.pull(k, val) expected_diff = val - orig_val # use copy because push modifies grad compr = mx.contrib.nd.create_2bit(grad_cpy) - mx.contrib.ndarray.quantize_2bit(grad_cpy, mx.nd.zeros(d[1]), compr, neg, pos) + mx.contrib.ndarray.quantize_2bit(grad_cpy, mx.nd.zeros(s), compr, neg, pos) decompr = mx.nd.zeros(grad.shape) mx.contrib.ndarray.dequantize_2bit(compr, decompr) decompr *= nworker * rate assert_almost_equal(expected_diff, decompr) - # print ('worker '+str(my_rank)+' started') - # check_default_keys(kv, my_rank, nworker) - # check_row_sparse_keys(kv, my_rank, nworker) - # check_row_sparse_keys_with_zeros(kv, my_rank, nworker) - # check_big_row_sparse_keys(kv, my_rank, nworker) - # print('worker ' + str(my_rank) + ' is done with non compression tests') + # print ('worker '+str(my_rank)+' started') + # check_default_keys(kv, my_rank, nworker) + # check_row_sparse_keys(kv, my_rank, nworker) + # check_row_sparse_keys_with_zeros(kv, my_rank, nworker) + # check_big_row_sparse_keys(kv, my_rank, nworker) + # print('worker ' + str(my_rank) + ' is done with non compression tests') - # dont run non compressed keys after this + # dont run non compressed keys after this as kvstore now is set to compressed kv, pos, neg = init_kv_compressed(kv) check_compr_pull_before_push(kv) check_compr_zero(kv) check_compr_residual(kv, pos, nworker) check_compr_ones(kv, pos, nworker) check_compr_random(kv, pos, neg, nworker) - print('worker ' + str(my_rank) + ' is done with compression tests') if __name__ == "__main__": diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 2ccc175bdff9..235e2e0de0a4 100644 --- a/tests/python/unittest/test_operator.py +++ 
b/tests/python/unittest/test_operator.py @@ -4308,8 +4308,8 @@ def check(data, idx): def test_two_bit_quantization(): neg_threshold = -0.5 pos_threshold = 0.5 - orig_shape = [(25,),(16,),(1121)] - num_repeat = 10 + orig_shape = [(144000)]#[(25,),(16,),(1121),(14400000)] + num_repeat = 1 from struct import pack,unpack # from bitstring import BitArray @@ -4413,10 +4413,10 @@ def random_small_range(shape): for i in range(num_repeat): data = [] data.append(zerodata(shape)) - data.append(onesdata(shape)) - data.append(random_data(shape)) - data.append(random_large_range(shape)) - data.append(random_small_range(shape)) + # data.append(onesdata(shape)) + # data.append(random_data(shape)) + # data.append(random_large_range(shape)) + # data.append(random_small_range(shape)) for d in data: check(d[0], d[1]) From 287e04092f96a9266b6ad386b6a097baf026b7c1 Mon Sep 17 00:00:00 2001 From: Rahul Date: Tue, 17 Oct 2017 22:48:38 -0700 Subject: [PATCH 116/237] undo changes in submodule --- dmlc-core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dmlc-core b/dmlc-core index ada2f9fee50f..595d02c0e87b 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit ada2f9fee50f099465685ce6350f40817be45b53 +Subproject commit 595d02c0e87be8a0846700462b6f45f1b1031e39 From 9290c23706763faa187957e2984ea98a9f220c0b Mon Sep 17 00:00:00 2001 From: Rahul Date: Tue, 17 Oct 2017 22:50:42 -0700 Subject: [PATCH 117/237] submodule reset --- mshadow | 2 +- nnvm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mshadow b/mshadow index 2419833319ab..cb5c9872b542 160000 --- a/mshadow +++ b/mshadow @@ -1 +1 @@ -Subproject commit 2419833319ab89f597f8e2e386d800bd3af339fe +Subproject commit cb5c9872b542220be5b99f3aca0e1ff56e31b028 diff --git a/nnvm b/nnvm index 65a1a7104f8d..c86afa8f17a4 160000 --- a/nnvm +++ b/nnvm @@ -1 +1 @@ -Subproject commit 65a1a7104f8dca986c57765012555172239b31b1 +Subproject commit c86afa8f17a44bcd4e6eec41cd49ba87e4f7a635 From 
99154c9a6d99624ad6ab737345b645340eac633e Mon Sep 17 00:00:00 2001 From: Rahul Date: Tue, 17 Oct 2017 22:52:03 -0700 Subject: [PATCH 118/237] remove files --- example/image-classification/hosts | 5 ----- src/kvstore/.nfsefebc5260d811ddf0000001b | Bin 16384 -> 0 bytes 2 files changed, 5 deletions(-) delete mode 100644 example/image-classification/hosts delete mode 100644 src/kvstore/.nfsefebc5260d811ddf0000001b diff --git a/example/image-classification/hosts b/example/image-classification/hosts deleted file mode 100644 index ea9f2cc6c033..000000000000 --- a/example/image-classification/hosts +++ /dev/null @@ -1,5 +0,0 @@ -172.31.17.116 -172.31.34.134 -172.31.69.158 -172.31.67.2 - diff --git a/src/kvstore/.nfsefebc5260d811ddf0000001b b/src/kvstore/.nfsefebc5260d811ddf0000001b deleted file mode 100644 index 99035685faf8ad3ceb66ff3457274b938f054e0b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeHNON<;>6)p3#i4zlyUUDQz z>X-UVdx&gH;UhkZkPv}IWL9igMPh-KRaQWVU=f+$0tuu@AcSBMA(D_7IQP}7{&~ih zl~qKQbX48%bL-yo?z^wvebp+h*Uquyg(nqUk1EPPwO?K=9W2ih&3wMJTkGqwz#?x4#g1My9X)Dkq3ciCR~;S}179z0ZUrdd$z3(f zAS`ru6PT8#45SP^00WOGM`xzR{5+gF#6ER!?E(6xO;ZL^22uu622uu622uu622uu6 z20rKvgx+4|28{TSG{lzoJ!AL(%Bv&Gow54=$oh-2{5NCucVztqSzeG2NS^batiK}b zzcyC?uB>m#`Zvbv|1Ikavc4(}O61Gp^p-M^GLSNmGLSNmGLSNmGLSNmGLSNmGLSOx zf60JmD9Q;~zMuBXIR8)f|93v2D6arFfJI;*aQlFwyZ~$fP2e%$5b(w$it@Xj?1i4dkAWM&2Jji+ z0I(0Z^)bi>T7U*D0*?dxfxmuKQGN^D0-gsh1FOIn0S4>^_5gQ2qA2eHKLc(8Uj|ly zIUoyUfX@M+1z!8GqI?@T3rqoTV_e<@{sH_QcpZ2V_&)GGzyT%z8o!5t_i-^?^JlB` zOKN?kSzlXT&U%5`bNx;3tGN>jRquA`t`meT2#s<%2z}G(9AUa`s4dsn&M|k3`@U&# z_N>C#mTMZU!5w4XwQbEYnr?&pXb@ygClsAbBM_$H*?Hg#SupW$IP zH+5=GgGBB_!E$+XL#TMhcOy@zDISru7sm&hIlMFYyBok|X3@wB^u#-&wjQ!#o<5mz4 zFpb)%7X@N~RSa7I-ekY5yi1#j)mjj{LftiZHkXhpWc6FK*;$q`oh{9R8%Yrv(m@RR zu>9EOwOHXJN!xq7gTXBxayVV$_KDY%pDmgWp86%8ZN<)~ws;$(t@5j8m^eq`wnRe0 z8U5bT;WyzeUTkunWilh9Uo2uaxgHCc0Mc1!bRyLk=()X^*Nw9zqw=g}P9r%#tc2rFCmK$*L 
zAH%kEF`ui>RyK3)LcQ8l(YLv>T2&Wnji$P~vY|H4S5_NUb-lVex6-I)@~o}F&m>bV zvxAY)wh?9`$GD%&x!Re^>T2bpI=8Y`Ux29TQmMoy#7zoJMl87#gX0l~O~ys>xQU?C zFVef*h%D|8jpTjemqL$-NAb`uP0c{{xlS>3doU>&D;9fQ_(NCoJnjU{h$!l^q~#YB zAy=r{2sJ;f1lx|D&4>@}E|&(rHgI(eiNkxM$I^Q20JA>VEHylpElG=Z_YTI&BjGO_ zVU!KNg`lk>_}Mu`zW%JrI+100p^w7YpVfYea6^98vrOF#6N$o?TCQucaj6RPq88eg z72VQ;V7LPmitCa>5td2*L=j-|!WI;u0%ryfHPZ^1>#)l{Z)3jFnsgTHlLeO-3{oF6 zd?FB2GFF>8h*T(f+HJAW#Y*9XK9r<6i!iBCL4aw%9etZ=wu|x6EDLf-6dE~$mL3S< z1~UV+=xN)5kfoUpjVXz6yu{?B+Z6YhEQ=r+LZE`%@ z#~<#SB77MI=Agpb20fxXgO$|YK>=p-(OjP7A+#|q5lD9CVSS=qC(^L|g`5)`@Isg+ zqzO?Z1(pO63?fyf!IxWDvD=J3==9B26rxss?5PwdCh#Hyr2q5UD28fKCL9+f zhw5d`7QBmI2cI4dF%S8+xtts^7S@hm9QaMXZnBg{=APJjonIV?R;S$a2xV{;7?cu^Lu7lFylK90A zB?fO}C19f=dn9z9Rf#hP4e!%R&c)NQZ;UvHX?qs8xf8}t-_inj3_RNlreG2$A3fbR zEsj7Z9VYo$Sh$mvjD>;wVm2%z6gdGGlO^!DPVYS-GSgny^bCpJ| zk!KsV=Gm3CCflHesotzr8*F8j&9BrKYR%e89sd?trGAlm25vj-(L5fWKX(~N0c5xV z=+YeOzUEk@r_Y~>OdmY95EMKLvRVraT3U}x&^kUx9U__OupT@D(T8FI31F-nhF%$~ zPOn$cFipF>;QE~+L=dY%ak(~Mtv9MuGli1u(gR1sM1Y!$Zp3VDo4GB7Tj+)`=iy&_ zxEbDZ=z$qIURbKww(RD(A*}#t4Lily^Q$`dLeiJsuq{(_bU`$1piTpVDnex@h345h zb{rH-p)SkH(eXLnCkzU?q?T`Xx*>CW4qA3i585{oQ!9-{ctmXTKo3%Qn0INqjL2~; zH4G5y;uR9wBDmVZt*IlJM-bv7$e@_%rtUF9Y8N zY=8sjfHS}`UYfR}(D0p9|?3TVJ3paPr*4g!BeKHwJcJfH$6 zfFr;sfk%ONq37GcTfiTIUjVNHKL^ML${nOv%0S9M%0S9M%0S9M%0S9M%D@McfjHek z*}nU1q!s0wXf>8GxvMluxhU5tBP?=i!+BYp1Tt1%s8Cw9f7GB-)1)vtArOT{=q2+y z{WA@*+j%BJdI-64bQFg|k>mQ{Cl9Tgr vI$J}!*dMV_j3rbaRNYJML{mCUjY&Z3ES>@13joKXG;TltX- From 52b6905b4ab792781356f4227ae9989f2f3f277e Mon Sep 17 00:00:00 2001 From: Rahul Date: Tue, 17 Oct 2017 22:58:56 -0700 Subject: [PATCH 119/237] undo changes unrelated to project --- CMakeLists.txt | 31 +++------------------- example/image-classification/common/fit.py | 2 +- 2 files changed, 5 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f5142727a375..092e8299423e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,7 @@ endif() set(CMAKE_MODULE_PATH 
"${PROJECT_SOURCE_DIR}/cmake/Modules;${CMAKE_MODULE_PATH}") + include(cmake/Utils.cmake) mxnet_option(USE_OPENCV "Build with OpenCV support" ON) mxnet_option(USE_OPENMP "Build with Openmp support" ON) @@ -17,8 +18,7 @@ mxnet_option(USE_LAPACK "Build with lapack support" ON IF NOT MSVC) mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) mxnet_option(USE_MKLML_MKL "Use MKLML variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE)) mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and found)" OFF) -mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON) -mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) +mxnet_option(USE_JEMALLOC "Build with Jemalloc support" OFF) mxnet_option(USE_PROFILER "Build with Profiler support" OFF) mxnet_option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) mxnet_option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF) @@ -36,7 +36,6 @@ if("$ENV{VERBOSE}" STREQUAL "1") set(CMAKE_VERBOISE_MAKEFILE ON) endif() - if(MSVC) add_definitions(-DWIN32_LEAN_AND_MEAN) add_definitions(-DDMLC_USE_CXX11) @@ -90,9 +89,6 @@ if(USE_VTUNE) if(NOT VTUNE_ROOT) set(VTUNE_ROOT /opt/intel/vtune_amplifier_xe_2017) endif() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-omit-frame-pointer -g -pg") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer -g -pg") - set(CMAKE_LINK_LIBRARY_FILE_FLAG "${CMAKE_LINK_LIBRARY_FILE_FLAG} -g -pg") add_definitions(-DMXNET_USE_VTUNE=1) include_directories(${VTUNE_ROOT}/include) list(APPEND mxnet_LINKER_LIBS ${VTUNE_ROOT}/lib64/libittnotify.a) @@ -168,33 +164,16 @@ if(UNIX) endif() endif() -set(ALT_MALLOC_FLAGS "-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free") - -# ---[ gperftools -if(USE_GPERFTOOLS) - find_package(Gperftools) - if(GPERFTOOLS_FOUND) - message(STATUS "Using Gperftools malloc (tcmalloc)") - include_directories(${GPERFTOOLS_INCLUDE_DIR}) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} 
${ALT_MALLOC_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ALT_MALLOC_FLAGS}") - set(mxnet_LINKER_LIBS ${mxnet_LINKER_LIBS} ${GPERFTOOLS_LIBRARIES}) - set(USE_JEMALLOC 0) - endif() -endif() - # ---[ jemalloc if(USE_JEMALLOC) find_package(JeMalloc) if(JEMALLOC_FOUND) - message(STATUS "Using JEMalloc malloc") add_definitions(-DUSE_JEMALLOC) include_directories(${JEMALLOC_INCLUDE_DIRS}) set(mxnet_LINKER_LIBS ${mxnet_LINKER_LIBS} ${JEMALLOC_LIBRARIES}) endif() endif() -# ---[ OpenCV if(USE_OPENCV) find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found @@ -205,7 +184,7 @@ if(USE_OPENCV) message(STATUS " OpenCV_LIBS=${OpenCV_LIBS}") message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})") add_definitions(-DMXNET_USE_OPENCV=1) - if(NOT APPLE) + if(NOT MSVC) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-undefined,error") else() @@ -217,7 +196,6 @@ else(USE_OPENCV) add_definitions(-DMXNET_USE_OPENCV=0) endif() -# ---[ OpenMP if(USE_OPENMP) find_package(OpenMP REQUIRED) if(OPENMP_FOUND) @@ -230,7 +208,6 @@ elseif(UNIX) list(APPEND mxnet_LINKER_LIBS pthread) endif() -# ---[ LAPack if(USE_LAPACK) add_definitions(-DMXNET_USE_LAPACK=1) list(APPEND mxnet_LINKER_LIBS lapack) @@ -271,7 +248,7 @@ endif() if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mshadow/cmake) add_subdirectory("mshadow") endif() -FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h") +FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h") FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh") # add nnvm to source diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 8bf081454df7..51a1abec7c48 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -114,7 +114,7 @@ def fit(args, network, data_loader, **kwargs): """ # kvstore kv = mx.kvstore.create(args.kv_store) - 
kv.set_compress({'compress':'2bit','pos_threshold':0.5,'neg_threshold':-0.5}) + # logging head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) From b560d25be681c5faa8bf2549c87e0d7dd4edb6d4 Mon Sep 17 00:00:00 2001 From: Rahul Date: Tue, 17 Oct 2017 23:00:33 -0700 Subject: [PATCH 120/237] undo changes unrelated to project --- CMakeLists.txt | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 092e8299423e..76ef5afa57ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,6 @@ endif() set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules;${CMAKE_MODULE_PATH}") - include(cmake/Utils.cmake) mxnet_option(USE_OPENCV "Build with OpenCV support" ON) mxnet_option(USE_OPENMP "Build with Openmp support" ON) @@ -18,7 +17,8 @@ mxnet_option(USE_LAPACK "Build with lapack support" ON IF NOT MSVC) mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) mxnet_option(USE_MKLML_MKL "Use MKLML variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE)) mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and found)" OFF) -mxnet_option(USE_JEMALLOC "Build with Jemalloc support" OFF) +mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON) +mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) mxnet_option(USE_PROFILER "Build with Profiler support" OFF) mxnet_option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) mxnet_option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF) @@ -36,6 +36,7 @@ if("$ENV{VERBOSE}" STREQUAL "1") set(CMAKE_VERBOISE_MAKEFILE ON) endif() + if(MSVC) add_definitions(-DWIN32_LEAN_AND_MEAN) add_definitions(-DDMLC_USE_CXX11) @@ -89,6 +90,9 @@ if(USE_VTUNE) if(NOT VTUNE_ROOT) set(VTUNE_ROOT /opt/intel/vtune_amplifier_xe_2017) endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-omit-frame-pointer -g -pg") + set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer -g -pg") + set(CMAKE_LINK_LIBRARY_FILE_FLAG "${CMAKE_LINK_LIBRARY_FILE_FLAG} -g -pg") add_definitions(-DMXNET_USE_VTUNE=1) include_directories(${VTUNE_ROOT}/include) list(APPEND mxnet_LINKER_LIBS ${VTUNE_ROOT}/lib64/libittnotify.a) @@ -164,16 +168,33 @@ if(UNIX) endif() endif() +set(ALT_MALLOC_FLAGS "-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free") + +# ---[ gperftools +if(USE_GPERFTOOLS) + find_package(Gperftools) + if(GPERFTOOLS_FOUND) + message(STATUS "Using Gperftools malloc (tcmalloc)") + include_directories(${GPERFTOOLS_INCLUDE_DIR}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ALT_MALLOC_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ALT_MALLOC_FLAGS}") + set(mxnet_LINKER_LIBS ${mxnet_LINKER_LIBS} ${GPERFTOOLS_LIBRARIES}) + set(USE_JEMALLOC 0) + endif() +endif() + # ---[ jemalloc if(USE_JEMALLOC) find_package(JeMalloc) if(JEMALLOC_FOUND) + message(STATUS "Using JEMalloc malloc") add_definitions(-DUSE_JEMALLOC) include_directories(${JEMALLOC_INCLUDE_DIRS}) set(mxnet_LINKER_LIBS ${mxnet_LINKER_LIBS} ${JEMALLOC_LIBRARIES}) endif() endif() +# ---[ OpenCV if(USE_OPENCV) find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found @@ -196,6 +217,7 @@ else(USE_OPENCV) add_definitions(-DMXNET_USE_OPENCV=0) endif() +# ---[ OpenMP if(USE_OPENMP) find_package(OpenMP REQUIRED) if(OPENMP_FOUND) @@ -208,6 +230,7 @@ elseif(UNIX) list(APPEND mxnet_LINKER_LIBS pthread) endif() +# ---[ LAPack if(USE_LAPACK) add_definitions(-DMXNET_USE_LAPACK=1) list(APPEND mxnet_LINKER_LIBS lapack) @@ -248,7 +271,7 @@ endif() if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mshadow/cmake) add_subdirectory("mshadow") endif() -FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h") +FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h") FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh") # add nnvm to source From 60b1b69bfd198da199552cdf1e61d751706dc833 Mon 
Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 18 Oct 2017 02:31:29 -0700 Subject: [PATCH 121/237] Comments and cleanup. Remaining are src/kvstore, src/operator and tests --- cpp-package/include/mxnet-cpp/kvstore.h | 2 +- cpp-package/include/mxnet-cpp/kvstore.hpp | 4 +- include/mxnet/c_api.h | 12 ++-- include/mxnet/kvstore.h | 12 ++-- include/mxnet/ndarray.h | 24 ++++++- python/mxnet/gluon/trainer.py | 28 ++++---- python/mxnet/kvstore.py | 75 ++++++++++++++++++--- python/mxnet/kvstore_server.py | 2 +- python/mxnet/module/module.py | 9 +++ src/c_api/c_api.cc | 8 +-- src/kvstore/comm.h | 5 +- src/kvstore/kvstore_dist.h | 59 ++++++---------- src/kvstore/kvstore_local.h | 4 +- src/ndarray/ndarray.cc | 13 ++-- src/ndarray/ndarray_function.cc | 7 +- src/ndarray/ndarray_function.cu | 8 ++- src/ndarray/ndarray_function.h | 6 ++ src/operator/contrib/two_bit_quantize-inl.h | 36 +++++----- 18 files changed, 196 insertions(+), 118 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/kvstore.h b/cpp-package/include/mxnet-cpp/kvstore.h index 3c5e445f30ba..45d7bfacacda 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.h +++ b/cpp-package/include/mxnet-cpp/kvstore.h @@ -37,7 +37,7 @@ class KVStore { public: static void SetType(const std::string& type); static void RunServer(); - static void SetCompress(const std::string& compress, const float pos_threshold, const float neg_threshold); + static void SetCompress(const std::string& compress, const float neg_threshold, const float pos_threshold); static void Init(int key, const NDArray& val); static void Init(const std::vector& keys, const std::vector& vals); static void Push(int key, const NDArray& val, int priority = 0); diff --git a/cpp-package/include/mxnet-cpp/kvstore.hpp b/cpp-package/include/mxnet-cpp/kvstore.hpp index b32a6dcbc770..038c88745447 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.hpp +++ b/cpp-package/include/mxnet-cpp/kvstore.hpp @@ -83,9 +83,9 @@ inline void KVStore::RunServer() { } inline void 
KVStore::SetCompress(const std::string& compress, - const float pos_threshold, const float neg_threshold) { + const float neg_threshold, const float pos_threshold) { CHECK_EQ(MXKVStoreSetCompress(get_kvstore()->get_handle(), - compress.c_str(), pos_threshold, neg_threshold),0); + compress.c_str(), neg_threshold, pos_threshold),0); } inline void KVStore::Init(int key, const NDArray& val) { diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 68d73d830746..652a7d4167fc 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1532,17 +1532,17 @@ MXNET_DLL int MXKVStoreCreate(const char *type, KVStoreHandle *out); /*! - * \brief Set to use low-bit compression + * \brief Set parameters to use low-bit compressed gradients * \param handle handle to the kvstore - * \param compress set to use low-bit compression - * \param pos_threshold set the positive threshold in 2bit compress - * \param neg_threshold set the negative threshold in 2bit compress + * \param compress type of compression + * \param neg_threshold set the negative threshold for 2bit compression + * \param pos_threshold set the positive threshold for 2bit compression * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXKVStoreSetCompress(KVStoreHandle handle, const char *compress, - const float pos_threshold, - const float neg_threshold); + const float neg_threshold, + const float pos_threshold); /*! * \brief Delete a KVStore handle. 
diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index a5c34f096958..9d3124a83d6a 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -65,10 +65,13 @@ class KVStore { inline const std::string& type() { return type_; } /** - * \brief set to use low-bit compression + * \brief Set parameters to use low-bit compressed gradients + * \param compress type of compression + * \param neg_threshold set the negative threshold for 2bit compression + * \param pos_threshold set the positive threshold for 2bit compression */ - virtual void SetCompress(const std::string& compress, const float pos_threshold, - const float neg_threshold) = 0; + virtual void SetCompress(const std::string& compress, const float neg_threshold, + const float pos_threshold) = 0; /*! * \brief Initialize a list of key-value pair to the store. @@ -394,7 +397,8 @@ class KVStore { std::string type_; /** - * \brief whether using low-bit compression + * \brief Specifies whether or not to use compressed gradients + * Can be `none` or `2bit` for now */ std::string compress_ = "none"; diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index bf11d62969e7..b26197d1b5a5 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -892,10 +892,30 @@ size_t num_aux_data(NDArrayStorageType stype); */ void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0); -void Quantize(const NDArray &from, NDArray *to, NDArray *residual, +/*! 
+ * \brief Issue quantize operation to be scheduled by the engine + * Compresses `from` into `to` and accumulates the quantization error + * into 'residual' + * \param from the ndarray containing original data to be compressed + * \param to the target ndarray which contains compressed data + * \param residual the ndarray which accumulates quantization error + * \param compress type of compression + * \param neg_threshold negative threshold for 2bit quantization + * \param pos_threshold positive threshold for 2bit quantization + * \param priority Priority of the action. + */ +void Quantize(const NDArray &from, NDArray *to, NDArray *residual, std::string& compress, const float neg_threshold, const float pos_threshold, - std::string& compress, int priority); + int priority); +/*! + * \brief Issue dequantize operation to be scheduled by the engine + * Dequantizes data in `from` into `to` + * \param from the ndarray containing compressed data + * \param to the target ndarray which contains original data + * \param compress type of compression + * \param priority Priority of the action. + */ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int priority); /*! diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index ec6b2c8d9775..227370bd38af 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -44,12 +44,14 @@ class Trainer(object): kvstore : str or KVStore kvstore type for multi-gpu and distributed training. See help on :any:`mxnet.kvstore.create` for more information. - compress : str - whether using low-bit compression. The argument can be 'none', '2bit', and '1bit'. - pos_threshold: - positive threshold used in 2bit compression. - neg_threshold: - negative threshold used in 2bit compression. + compress_params : dict + Specifies type of gradient compression and additional arguments depending + on the type of compression being used. 
+ For example, 2bit compression requires a positive threshold and negative threshold. + So to completely the arguments for 2bit compression, we would need to pass + a dictionary like the following. + {'compress':'2bit', 'positive_threshold':0.5, 'negative_threshold':-0.5} + See mxnet.KVStore.set_compress method for more details on gradient compression. Properties ---------- @@ -72,14 +74,9 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', "First argument must be a list or dict of Parameters, " \ "got list of %s."%(type(param))) self._params.append(param) - if compress_params and compress_params.compress != 'none': - if (compress_params.compress != '2bit' and compress_params.compress != '1bit'): - raise ValueError("The compress argument can only be 'none', " \ - "'2bit', or '1bit'.") - if (compress_params.compress == '2bit' - and (compress_params.pos_threshold <= 0 or compress_params.neg_threshold >= 0)): - raise ValueError("The pos_threshold must be greater than 0, and " \ - "the neg_threshold must be less than 0.") + if compress_params : + if not isinstance(compress_params, dict): + raise ValueError("compress_params needs to be a dictionary") self._compress_params = compress_params if compress_params else {'compress':'none'} optimizer_params = optimizer_params if optimizer_params else {} self._scale = optimizer_params.get('rescale_grad', 1.0) @@ -116,8 +113,7 @@ def _init_optimizer(self, optimizer, optimizer_params): def _init_kvstore(self): arg_arrays = {param.name: param.data(self._contexts[0]) for param in self._params} - kvstore, update_on_kvstore = _create_kvstore(self._kvstore, - len(self._contexts), + kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts), arg_arrays) if kvstore: kvstore.set_compress(self._compress_params) diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 8c033cdc7fd0..7877f9331b9e 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -349,14 +349,73 
@@ def row_sparse_pull(self, key, out=None, priority=0, row_ids=None): check_call(_LIB.MXKVStorePullRowSparse( self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority))) + def set_compress(self, compress_params={'compress':'none'}): + """ Specifies type of low-bit quantization for gradient compression if any, + and additional arguments depending on the type of compression being used. - def set_compress(self, compress_params={}): - """ Set to use low-bit compression - - compress can be 'none', '2bit', or '1bit'. + Parameters + ---------- + compress_params : dict + `compress_params` is a dictionary specifying the type and parameters + for gradient compression. The key `compress` in this dictionary is a required argument + and specifies the type of gradient compression. Other keys in this + dictionary are optional and specific to the type of gradient compression. + + 2bit Gradient Compression + --------- + 2bit gradient compression takes two thresholds, one for positive values and + other for negative thresholds. This works by limiting positive values in the + gradient to the positive threshold, and limiting negative values to the + negative threshold. Values which don't meet the thresholds are set to 0. + By doing so, each value in the gradient is in one of three states. 2bits are + used to represent these states, and every 16 float values in the original + gradient can be represented using one float. This compressed representation + can reduce communication costs. The difference between these values and + original values is stored at the sender's end as residual and added to the + gradient in the next iteration. + + When kvstore is 'local', gradient compression is used to reduce communication + between multiple devices (gpus). Gradient is quantized on each GPU which + computed the gradients, then sent to the GPU which merges the gradients. This + receiving GPU dequantizes the gradients and merges them. 
Note that this + increases memory usage on each GPU because of the residual array stored. + + When kvstore is 'dist', gradient compression is used to reduce communication + from worker to sender. Gradient is quantized on each worker which + computed the gradients, then sent to the server which dequantizes + this data and merges the gradients from each worker. Note that this + increases CPU memory usage on each worker because of the residual array stored. + Only worker to server communication is compressed in this setting. + If each machine has multiple GPUs, currently this GPU to GPU communication is + not compressed. Server to worker communication (in the case of pull) is also not + compressed. + + To use 2bit compression, we need to specify `compress` as `2bit`. + Only specifying `compress` would use default values + for the other arguments of thresholds. + To completely specify the arguments for 2bit compression, we would need to pass + a dictionary which includes `positive_threshold` and `negative_threshold` like: + {'compress':'2bit', 'positive_threshold':0.5, 'negative_threshold':-0.5} + compress: str + type of low-bit quantization to be used for gradient compression + Can only be '2bit' for now. + 2bit gradient compression uses 2bit quantization with residual to compress + gradients. It works by converts each value in the original gradient to use + 2 bits, causing size of gradient to be 1/16th of the original gradient + (and 3 floats of meta information). + pos_threshold: float + positive threshold used for 2bit quantization of gradients + Positive values in gradient above positive threshold will be set to + positive threshold. Positive values lesser than positive threshold will + be set to 0. + neg_threshold: float + negative threshold used for 2bit quantization of gradients + Negative values in gradient less than negative threshold will be set to + negative threshold. Negative values greater than negative threshold will + be set to 0. 
""" if 'compress' not in compress_params: - compress_params['compress'] = 'none' + raise ValueError('compress_params requires compress to be set') elif not isinstance(compress_params['compress'], string_types): raise TypeError('compress must be a string') elif compress_params['compress'] not in ['none','2bit']: @@ -375,15 +434,15 @@ def set_compress(self, compress_params={}): else: compress_params['neg_threshold'] = -0.1 - if (compress_params['pos_threshold'] <= 0 or compress_params['neg_threshold'] >= 0): + if compress_params['pos_threshold'] <= 0 or compress_params['neg_threshold'] >= 0: raise ValueError('pos_threshold needs to be greater than 0, \ and neg_threshold needs to be less than 0') if compress_params['compress'] == '2bit': check_call(_LIB.MXKVStoreSetCompress(self.handle, c_str(compress_params['compress']), - mx_float(compress_params['pos_threshold']), - mx_float(compress_params['neg_threshold']))) + mx_float(compress_params['neg_threshold']), + mx_float(compress_params['pos_threshold']))) def set_optimizer(self, optimizer): """ Registers an optimizer with the kvstore. diff --git a/python/mxnet/kvstore_server.py b/python/mxnet/kvstore_server.py index 1bb995a45ca8..2504b4674a83 100644 --- a/python/mxnet/kvstore_server.py +++ b/python/mxnet/kvstore_server.py @@ -57,7 +57,7 @@ def server_controller(cmd_id, cmd_body, _): raise self.kvstore.set_optimizer(optimizer) else: - print ("server %d, unknown command (%d, %s)" % ( + print("server %d, unknown command (%d, %s)" % ( self.kvstore.rank, cmd_id, cmd_body)) return server_controller diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index 5b5fe922a66c..310443a89aae 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -59,6 +59,15 @@ class Module(BaseModule): state_names : list of str states are similar to data and label, but not provided by data iterator. Instead they are initialized to 0 and can be set by `set_states()`. 
+ compress_params : dict + Specifies type of gradient compression and additional arguments depending + on the type of compression being used. + For example, 2bit compression requires a positive threshold and negative threshold. + So to completely the arguments for 2bit compression, we would need to pass + a dictionary like the following. + {'compress':'2bit', 'positive_threshold':0.5, 'negative_threshold':-0.5} + See mxnet.KVStore.set_compress method for more details on gradient compression. + """ def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',), logger=logging, context=ctx.cpu(), work_load_list=None, diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 4f7d2a4cb8cf..76cf046d57d6 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -735,12 +735,12 @@ int MXKVStoreCreate(const char *type, int MXKVStoreSetCompress(KVStoreHandle handle, const char *compress, - const float pos_threshold, - const float neg_threshold) { + const float neg_threshold, + const float pos_threshold) { API_BEGIN(); static_cast(handle)->SetCompress(compress, - pos_threshold, - neg_threshold); + neg_threshold, + pos_threshold); API_END(); } diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index bc39f6a70f35..b4b3e904f7f5 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -48,7 +48,6 @@ class Comm { */ virtual void Init(int key, const NDArrayStorageType stype, const TShape& shape, int dtype = mshadow::kFloat32) = 0; - /** * \brief returns src[0] + .. 
+ src[src.size()-1] */ @@ -554,8 +553,8 @@ class CommDevice : public Comm { if (compress_=="none") { CopyFromTo(src[i], &(buf.copy_buf[i]), priority); } else if (compress_ == "2bit") { - Quantize(src[i], &(buf.small_send_buf[i]), &(buf.residual[i]), - neg_threshold_, pos_threshold_, compress_, priority); + Quantize(src[i], &(buf.small_send_buf[i]), &(buf.residual[i]), compress_, + neg_threshold_, pos_threshold_, priority); CopyFromTo(buf.small_send_buf[i], &(buf.small_recv_buf[i]), priority); Dequantize(buf.small_recv_buf[i], &(buf.copy_buf[i]), compress_, priority); } else { diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index abdb06dfd90b..9e4fa1664bbe 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -113,9 +113,9 @@ namespace kvstore { } } - virtual void SetCompress(const std::string& compress, const float pos_threshold, - const float neg_threshold) override { - KVStoreLocal::SetCompress(compress, pos_threshold, neg_threshold); + virtual void SetCompress(const std::string& compress, const float neg_threshold, + const float pos_threshold) override { + KVStoreLocal::SetCompress(compress, neg_threshold, pos_threshold); if (get_rank() == 0) { SendCommandToServers(kSetCompress, compress_); } @@ -302,28 +302,15 @@ namespace kvstore { // merge over devices int key = uniq_keys[i]; const auto &vals = grouped_vals[i]; -// if (compress_!="none") { -// vals[0].WaitToRead(); -// for (int i = 0; i < vals[0].shape().Size(); i++) { -// CHECK_EQ(*((float *) vals[0].data().dptr_ + i), 0); -// } -// } - NDArray merged = do_merge ? 
comm_->Reduce(key, vals, priority) : vals[0]; const auto storage_type = merged.storage_type(); -// if (compress_!="none") { -// merged.WaitToRead(); -// for (int i = 0; i < merged.shape().Size(); i++) { -// CHECK_EQ(*((float *) merged.data().dptr_ + i), 0); -// } -// } auto &comm_buf = comm_buf_[key]; if (merged.ctx().dev_mask() == cpu::kDevMask) { - comm_buf= merged; // avoid memory copy // Start of a push doesn't guarantee that the previous pushes are completed. // This shouldn't affect training of networks though because training involves // a sequence of push, pull, then push. This imposes ordering that the // second push happens after the first pull, and the pull happens after first push. + comm_buf= merged; // avoid memory copy } else { if (comm_buf.is_none()) { if (storage_type == kDefaultStorage) { @@ -556,26 +543,10 @@ namespace kvstore { end_part_data = orig_size; } NDArray part_data = flattened_comm_buf.Slice(cur_from, end_part_data); -// for(int i=0; iSlice(cur_from, end_part_data); - Quantize(part_data, &part_compr, &part_res, neg_threshold_, pos_threshold_, compress_, priority); + Quantize(part_data, &part_compr, &part_res, compress_, neg_threshold_, pos_threshold_, priority); part_compr.WaitToRead(); -// CHECK_EQ(*(float *) part_compr.data().dptr_,-0.5); -// CHECK_EQ(*((float *) part_compr.data().dptr_+1),0.5); -// for(int i=3; i= bigarray_bound_) { @@ -619,8 +589,6 @@ namespace kvstore { original_size / bits + 3: original_size / bits + 4; } -// size = original_size; - mu_.lock(); PSKV& pskv = (is_push) ? 
push_ps_kv_[key] : pull_ps_kv_[key]; mu_.unlock(); @@ -782,23 +750,34 @@ namespace kvstore { return pskv; } - /** * \brief for worker to push and pull data */ ps::KVWorker* ps_worker_; + /** * \brief the server handle */ KVStoreDistServer* server_; + /** * \brief threshold for partition */ size_t bigarray_bound_; + + /** + * \brief buffer for non-compressed data + */ std::unordered_map comm_buf_; - /// \brief small buffer for quantize + + /** + * \brief buffer for compressed data + */ std::unordered_map compr_buf_; - /// \brief residual buffer for quantize + + /** + * \brief residual buffer to accumulate quantization error + */ std::unordered_map residual_; bool log_verbose_; diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 5efaa966ebbb..a4cece334470 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -135,8 +135,8 @@ class KVStoreLocal : public KVStore { PullRowSparseImpl(keys, val_rowids, priority); } - virtual void SetCompress(const std::string& compress, const float pos_threshold, - const float neg_threshold) override { + virtual void SetCompress(const std::string& compress, const float neg_threshold, + const float pos_threshold) override { compress_ = compress; pos_threshold_ = pos_threshold; neg_threshold_ = neg_threshold; diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 779cd973ff69..f263895032b4 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -558,9 +558,9 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) { } } -void Quantize(const NDArray &from, NDArray *to, NDArray *residual, +void Quantize(const NDArray &from, NDArray *to, NDArray *residual, std::string& compress, const float neg_threshold, const float pos_threshold, - std::string& compress, int priority) { + int priority) { CHECK(from.shape().ndim() != 0) << "source operands have zero dimension shape"; // important: callback must always capture by value @@ -568,11 +568,6 @@ void 
Quantize(const NDArray &from, NDArray *to, NDArray *residual, NDArray res = *residual; int a = from.ctx().dev_mask(); int b = to->ctx().dev_mask(); - std::vector const_vars; - const_vars.push_back(from.var()); - std::vector mutable_vars; - mutable_vars.push_back(ret.var()); - mutable_vars.push_back(res.var()); if (a == cpu::kDevMask && b == cpu::kDevMask) { if (compress == "2bit") { Engine::Get()->PushSync([from, res, ret, neg_threshold, pos_threshold](RunContext ctx) { @@ -582,7 +577,7 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, inputs[2] = ret.data(); mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, neg_threshold, pos_threshold); - }, from.ctx(), const_vars, mutable_vars, + }, from.ctx(), {from.var()}, {ret.var(), res.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); } else { LOG(FATAL) << "Unsupported Quantization"; @@ -600,7 +595,7 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, neg_threshold, pos_threshold); // Wait GPU kernel to complete ctx.get_stream()->Wait(); - }, from.ctx(), const_vars, mutable_vars, + }, from.ctx(), {from.var()}, {ret.var(), res.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); } else { LOG(FATAL) << "Unsupported Quantization"; diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index 057fd7796e15..5860b24a42ac 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -184,16 +184,21 @@ void ElementwiseSum(mshadow::Stream* s, } } +/* + * \brief Enables use of function defined under Dequantize2Bit operator for an ndarray + */ template<> void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { mxnet::op::Dequantize2BitImpl(s,inputs); } +/* + * \brief Enables use of function defined under Quantize2Bit operator for an ndarray + */ template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float 
pos_threshold) { mxnet::op::Quantize2BitImpl(s,inputs, neg_threshold, pos_threshold); - } } // namespace ndarray diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index e59e77325c68..197ac1e7970f 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -202,12 +202,18 @@ void ElementwiseSum(mshadow::Stream* s, << nds[0].storage_type(); } } - + +/* + * \brief Enables use of function defined under Dequantize2Bit operator for an ndarray + */ template<> void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { mxnet::op::Dequantize2BitImpl(s,inputs); } +/* + * \brief Enables use of function defined under Quantize2Bit operator for an ndarray + */ template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index cef2886c7e82..37c22d1dd4f7 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -164,9 +164,15 @@ void Copy(const TBlob &from, TBlob *to, Context from_ctx, Context to_ctx, RunContext ctx); +/* + * \brief Enables use of function defined under Dequantize2Bit operator for an ndarray + */ template void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs); +/* + * \brief Enables use of function defined under Quantize2Bit operator for an ndarray + */ template void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold); diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index e2dd507cb1e3..4bf462aeaa33 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -116,33 +116,33 @@ struct init_threshold_2bit { struct quantize_2bit { MSHADOW_XINLINE static void Map(int block_id, - int gradsize, + int grad_size, float *out, float 
*grad, float *residual, const float neg_threshold, const float pos_threshold) { - float* out_block = out + block_id; + float* compr_block = out + block_id; // init to 0 - *out_block = 0; + *compr_block = 0; // start and end are indices in original grad array int start = block_id*16; - int end = ( start + 16 <= gradsize) ? start+16 : gradsize; - char* ch_ptr = reinterpret_cast(out_block); + int end = (start+16 <= grad_size) ? start+16 : grad_size; + char* block_ptr = reinterpret_cast(compr_block); for (int i=start; i= pos_threshold) { - residual[i] = grad[i] - pos_threshold; + char* curr_byte = block_ptr + (i-start)/4; + float curr_value = grad[i] + residual[i]; + if (curr_value >= pos_threshold) { + residual[i] = curr_value - pos_threshold; // set data to 10 - (*curr_ptr) |= (2u<<(6-((i%4)*2))); - } else if (grad[i] <= neg_threshold) { - residual[i] = grad[i] - neg_threshold; + (*curr_byte) |= (2u<<(6-((i%4)*2))); + } else if (curr_value <= neg_threshold) { + residual[i] = curr_value - neg_threshold; // set data to 01 - (*curr_ptr) |= (1u<<(6-((i%4)*2))); + (*curr_byte) |= (1u<<(6-((i%4)*2))); } else { // leave data as 00 - residual[i] = grad[i]; + residual[i] = curr_value; } } } @@ -158,15 +158,15 @@ void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, inputs[0].Size()); // Finally, compress the data and calculate new residual mxnet_op::Kernel::Launch(s, inputs[2].Size()-3, - inputs[0].Size(), + inputs[0].Size(), // original grad size inputs[2].dptr()+3, // compressed array inputs[0].dptr(), // input array inputs[1].dptr(), // residual array - neg_threshold, // negative threshold - pos_threshold); // positive threshold + neg_threshold, // negative threshold + pos_threshold); // positive threshold } -// function defined as operator +// this function has been defined as quantize_2bit operator template void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, From e3153cec22e0ef5ba7d22c6af4780af2e5a17c2b Mon Sep 17 00:00:00 2001 
From: Rahul Huilgol Date: Wed, 18 Oct 2017 04:23:51 -0700 Subject: [PATCH 122/237] more cleanup and comments --- src/kvstore/comm.h | 113 ++++++++++------ src/kvstore/kvstore_dist.h | 136 ++++++++++---------- src/kvstore/kvstore_dist_server.h | 69 ++-------- src/kvstore/kvstore_local.h | 2 +- src/operator/contrib/two_bit_quantize-inl.h | 1 - src/operator/contrib/two_bit_quantize.cc | 75 ++++++----- src/operator/contrib/two_bit_quantize.cu | 3 +- 7 files changed, 199 insertions(+), 200 deletions(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index b4b3e904f7f5..fc2b55e37aee 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -80,11 +80,12 @@ class Comm { } /** - * \brief set to use low-bit compression + * \brief Sets gradient compression parameters to be able to + * perform reduce with compressed gradients */ inline void SetCompress(const std::string& compress, - float const pos_threshold, - float const neg_threshold) { + const float neg_threshold, + const float pos_threshold) { compress_ = compress; pos_threshold_ = pos_threshold; neg_threshold_ = neg_threshold; @@ -92,8 +93,20 @@ class Comm { protected: Context pinned_ctx_; + + /* + * \brief Sets type of gradient compression + */ std::string compress_ = "none"; + + /* + * \brief sets positive threshold for 2bit gradient compression + */ float pos_threshold_; + + /* + * \brief sets negative threshold for 2bit gradient compression + */ float neg_threshold_; }; @@ -494,9 +507,9 @@ class CommDevice : public Comm { int priority) override { // avoid extra copy for single device, but it may bring problems for // abnormal usage of kvstore -// if (src.size() == 1) { -// return src[0]; -// } + if (src.size() == 1 && compress_=="none") { + return src[0]; + } if (!inited_) { std::vector devs; @@ -512,57 +525,77 @@ class CommDevice : public Comm { auto& buf = merge_buf_[key]; std::vector reduce(src.size()); - if (buf.copy_buf.empty()) { - // TODO(mli) this results in large device memory usage for huge ndarray, 
- // such as the largest fullc in VGG. consider to do segment reduce with - // NDArray.Slice or gpu direct memory access. for the latter, we need to - // remove some ctx check, and also it reduces 20% perf - buf.copy_buf.resize(src.size()); + if (compress_=="none"){ + CopyFromTo(src[0], &(buf.merged), priority); + reduce[0] = buf.merged; - if (compress_!="none") { + if (buf.copy_buf.empty()) { + // TODO(mli) this results in large device memory usage for huge ndarray, + // such as the largest fullc in VGG. consider to do segment reduce with + // NDArray.Slice or gpu direct memory access. for the latter, we need to + // remove some ctx check, and also it reduces 20% perf + buf.copy_buf.resize(src.size()-1); + for (size_t i = 0; i < src.size()-1; ++i) { + buf.copy_buf[i] = NDArray( + buf.merged.shape(), buf.merged.ctx(), false, buf.merged.dtype()); + } + } + for (size_t i = 0; i < src.size()-1; ++i) { + CopyFromTo(src[i+1], &(buf.copy_buf[i]), priority); + reduce[i+1] = buf.copy_buf[i]; + } + } else { + if (buf.copy_buf.empty()) { // one buf for each context + buf.copy_buf.resize(src.size()); buf.small_recv_buf.resize(src.size()); buf.small_send_buf.resize(src.size()); buf.residual.resize(src.size()); - } - for (size_t i = 0; i < src.size(); ++i) { - buf.copy_buf[i] = NDArray( - buf.merged.shape(), buf.merged.ctx(), false, buf.merged.dtype()); - if (compress_ != "none") { - buf.residual[i] = NDArray( - buf.merged.shape(), src[i].ctx(), false, buf.merged.dtype()); + for (size_t i = 0; i < src.size(); ++i) { + buf.copy_buf[i] = NDArray(buf.merged.shape(), buf.merged.ctx(), + false, buf.merged.dtype()); + buf.residual[i] = NDArray(buf.merged.shape(), src[i].ctx(), + false, buf.merged.dtype()); buf.residual[i] = 0; - int bits; - if (compress_ =="2bit") { - bits = 16; + if (compress_ == "2bit") { + int bits = 16; long int small_size = buf.merged.shape().Size() % bits == 0 ? 
buf.merged.shape().Size() / bits + 3 : buf.merged.shape().Size() / bits + 4; - buf.small_recv_buf[i] = NDArray( - TShape{small_size}, buf.merged.ctx(), false, buf.merged.dtype()); - buf.small_send_buf[i] = NDArray( - TShape{small_size}, src[i].ctx(), false, buf.merged.dtype()); + buf.small_recv_buf[i] = NDArray(TShape{small_size}, buf.merged.ctx(), + false, buf.merged.dtype()); + buf.small_send_buf[i] = NDArray(TShape{small_size}, src[i].ctx(), + false, buf.merged.dtype()); + } else { + LOG(FATAL) << "Unsupported type of compression "< #include #include -#include #include #include "./kvstore_local.h" #include "mxnet/engine.h" #include "ps/ps.h" #include "./kvstore_dist_server.h" #include "../ndarray/ndarray_function.h" -#include // for uint32_t #if MKL_EXPERIMENTAL == 1 #include #include "../operator/mkl/mkl_memory-inl.h" @@ -52,34 +50,13 @@ namespace kvstore { * it's the server node's job to control the data consistency among all * workers. see details on \ref ServerHandle::Start */ - -// void floatToBinary(float f, std::string& str) -// { -// union { float f; uint32_t i; } u; -// u.f = f; -// str.clear(); -// -// for (int i = 0; i < 32; i++) -// { -// if (u.i % 2) str.push_back('1'); -// else str.push_back('0'); -// u.i >>= 1; -// } -// -// // Reverse the string since now it's backwards -// std::string temp(str.rbegin(), str.rend()); -// str = temp; -// } - - - class KVStoreDist : public KVStoreLocal { +class KVStoreDist : public KVStoreLocal { public: explicit KVStoreDist(bool use_device_comm) : KVStoreLocal(use_device_comm), ps_worker_(nullptr), server_(nullptr) { if (IsWorkerNode()) { ps_worker_ = new ps::KVWorker(0); ps::StartAsync("mxnet\0"); - //what happens during recovery? if (!ps::Postoffice::Get()->is_recovery()) { ps::Postoffice::Get()->Barrier( ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); @@ -119,9 +96,6 @@ namespace kvstore { if (get_rank() == 0) { SendCommandToServers(kSetCompress, compress_); } - //this fails. everyone just waits. why? 
-// Barrier(); -// ps::Postoffice::Get()->Barrier(ps::kWorkerGroup + ps::kServerGroup); } void Barrier() override { @@ -241,6 +215,7 @@ namespace kvstore { FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreDistDefaultPull")); + comm_->Broadcast(key, recv_buf, grouped_vals[i], priority); } } @@ -301,8 +276,9 @@ namespace kvstore { for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devices int key = uniq_keys[i]; - const auto &vals = grouped_vals[i]; + const auto& vals = grouped_vals[i]; NDArray merged = do_merge ? comm_->Reduce(key, vals, priority) : vals[0]; + const auto storage_type = merged.storage_type(); auto &comm_buf = comm_buf_[key]; if (merged.ctx().dev_mask() == cpu::kDevMask) { @@ -310,7 +286,7 @@ namespace kvstore { // This shouldn't affect training of networks though because training involves // a sequence of push, pull, then push. This imposes ordering that the // second push happens after the first pull, and the pull happens after first push. - comm_buf= merged; // avoid memory copy + comm_buf = merged; // avoid memory copy } else { if (comm_buf.is_none()) { if (storage_type == kDefaultStorage) { @@ -329,11 +305,15 @@ namespace kvstore { PSKV &pskv = EncodeCompressedKey(key, original_size, true); // Init the small buffer and residual_ buffer for quantize if (small_buf.is_none()) { - // small buffer for quantize - small_buf = NDArray(TShape{pskv.size}, comm_buf.ctx(), false, comm_buf.dtype()); - // residual buffer for quantize - res_buf = NDArray(TShape{(long int) original_size}, comm_buf.ctx(), false, comm_buf.dtype()); - res_buf = 0; + if (storage_type == kDefaultStorage) { + // small buffer for quantize + small_buf = NDArray(TShape{pskv.size}, comm_buf.ctx(), false, comm_buf.dtype()); + // residual buffer for quantize + res_buf = NDArray(TShape{(long int) original_size}, comm_buf.ctx(), false, comm_buf.dtype()); + res_buf = 0; + } else { + LOG(FATAL) << "compression for non default storage type unsupported"; + } } if (compress_ == 
"2bit") { @@ -485,19 +465,24 @@ namespace kvstore { /** * \brief cache all key partitions + * + * `ps_kv_` is used for row sparse + * + * `push_ps_kv_` and `pull_ps_kv_`, used for default type gradients, are same + * when there is no gradient compression */ std::unordered_map ps_kv_; std::unordered_map push_ps_kv_; std::unordered_map pull_ps_kv_; + /** - * \brief serialize EncodeRowSparseKey and EncodeKey + * \brief serialize access to ps_kv_ or push_ps_kv_/pull_ps_kv_ while encoding keys */ std::mutex mu_; void PushCompressed(int key, NDArray& comm_buf, NDArray &small_buf, PSKV& pskv, int priority){ auto push_to_servers = [this, key, comm_buf, pskv, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) { - // convert to ps keys size_t size = small_buf.shape().Size(); real_t* data = small_buf.data().dptr(); #if MKL_EXPERIMENTAL == 1 @@ -508,6 +493,8 @@ namespace kvstore { CHECK_NOTNULL(ps_worker_)->ZPush( pskv.keys, vals, pskv.lens, kDefaultPushPull, [cb]() { cb(); }); }; + // acquire locks on both comm_buf and small_buf so that pull (which uses comm_buf) + // for the same key waits till push finishes Engine::Get()->PushAsync( push_to_servers, pinned_ctx_, @@ -518,40 +505,43 @@ namespace kvstore { PROFILER_MESSAGE("KVStoreDistCompressedPush")); } - + /* + * \brief Compresses data by dividing original data into a part for each server, then + * quantizing each of these data blocks. The sizes of these parts come from pskv. 
+ */ void Compress(NDArray& comm_buf, NDArray* small_buf, NDArray* res_buf, PSKV& pskv, int priority){ size_t orig_size = comm_buf.shape().Size(); + // to allow indexing parts for each server NDArray flattened_comm_buf = comm_buf.Reshape(TShape{(long int) orig_size}); - int bits; + if (compress_ == "2bit") { - bits = 16; + //should be start of data in original commbuf + size_t cur_from = 0; + //should be start of meta in new small_buf + size_t cur_to = 0; + for (size_t i = 0; i < pskv.keys.size(); i++) { + NDArray part_compr = small_buf->Slice(cur_to, cur_to + pskv.lens[i]); + + // removing the 3 values from pskv length which are meta data + // end_part_data represents end of original data for this part + size_t end_part_data = cur_from + (pskv.lens[i] - 3) * 16; + // don't exceed original size + if (end_part_data > orig_size) { + end_part_data = orig_size; + } + NDArray part_data = flattened_comm_buf.Slice(cur_from, end_part_data); + NDArray part_res = res_buf->Slice(cur_from, end_part_data); + + Quantize(part_data, &part_compr, &part_res, compress_, neg_threshold_, pos_threshold_, priority); + + cur_from = end_part_data; + cur_to = cur_to + pskv.lens[i]; + } + CHECK_EQ(cur_from, orig_size); + CHECK_EQ(cur_to, small_buf->shape().Size()); } else { LOG(FATAL) << "Unsupported compression type"; } - //should be start of data in original commbuf - size_t cur_from = 0; - //should be start of meta in new small_buf - size_t cur_to = 0; - for(size_t i=0; iSlice(cur_to, cur_to+pskv.lens[i]); - // removing the 3 values from pskv length which are meta data - size_t end_part_data = cur_from + (pskv.lens[i] - 3 )* bits; - // don't exceed origin_size - if (end_part_data > orig_size) { - end_part_data = orig_size; - } - NDArray part_data = flattened_comm_buf.Slice(cur_from, end_part_data); - NDArray part_res = res_buf->Slice(cur_from, end_part_data); - Quantize(part_data, &part_compr, &part_res, compress_, neg_threshold_, pos_threshold_, priority); - part_compr.WaitToRead(); - - 
cur_from = end_part_data; - cur_to = cur_to + pskv.lens[i]; - } - CHECK_EQ(cur_from, orig_size); - CHECK_EQ(cur_to, small_buf->shape().Size()); } PSKV& EncodeKey(int key, size_t size, bool is_push) { @@ -564,8 +554,8 @@ namespace kvstore { /** * \brief Convert to keys in ps for compressed values - * \brief Divides original array into equal parts for each server - * with space for meta info + * Divides original array into equal parts for each server + * with 3 floats space for meta info */ inline PSKV& EncodeCompressedKey(int key, size_t original_size, bool is_push) { auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); @@ -577,10 +567,14 @@ namespace kvstore { } else { LOG(FATAL)<<"Unsupported compression type"; } + // represents size of data to be sent size_t compr_size = 0; - // add 3 values as meta info + + // adds 3 values to both cases for meta info if (original_size >= bigarray_bound_) { + // if size of data is not divisible by bits, then we need an extra float + // to store the last few values compr_size = num_servers * ((original_size/num_servers) % bits == 0 ? (original_size/num_servers)/bits + 3 : (original_size/num_servers)/bits + 4); @@ -598,6 +592,8 @@ namespace kvstore { CHECK_EQ(static_cast(pskv.size), size)<< "The value size can't be changed"; } else { // populate both pull and push pskvs + // push pskv has sizes corresponding to compressed data + // pull pskv has decompressed sizes for parts in push_pskv mu_.lock(); PSKV& pull_pskv = pull_ps_kv_[key]; PSKV& push_pskv = push_ps_kv_[key]; @@ -627,6 +623,8 @@ namespace kvstore { if (part_orig + pskv.size > original_size) { part_orig = original_size - pskv.size; } + + // TODO(huilgolr) specific to 2bit compression. generalize size_t compr_split = (part_orig % bits == 0)? 
part_orig/bits + 3 : part_orig/bits + 4; ps::Key ps_key = krs[i].begin() + key; @@ -766,12 +764,16 @@ namespace kvstore { size_t bigarray_bound_; /** - * \brief buffer for non-compressed data + * \brief buffer for non-compressed data. + * When gradient compression is active, this is used + * for the data in pull and for original data in push */ std::unordered_map comm_buf_; /** * \brief buffer for compressed data + * Used when gradient compression is active and action + * is push */ std::unordered_map compr_buf_; diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 6f6f247e0b13..044ef21b2c9f 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -31,7 +31,6 @@ #include #include #include -#include #include "ps/ps.h" #include "mxnet/kvstore.h" #include "../operator/tensor/elemwise_binary_op-inl.h" @@ -46,23 +45,7 @@ static const int kDefaultPushPull = 0; static const int kStopServer = -1; static const int kSyncMode = -2; static const int kSetCompress = 2; - void floatToBinary2(float f, std::string& str) - { - union { float f; uint32_t i; } u; - u.f = f; - str.clear(); - for (int i = 0; i < 32; i++) - { - if (u.i % 2) str.push_back('1'); - else str.push_back('0'); - u.i >>= 1; - } - - // Reverse the string since now it's backwards - std::string temp(str.rbegin(), str.rend()); - str = temp; - } /** * \brief executor runs a function using the thread called \ref Start */ @@ -404,30 +387,22 @@ class KVStoreDistServer { NDArray recved = NDArray(recv_blob, 0); NDArray decomp_buf = decomp_buf_[key]; if (compress_ != "none") { - long int original_size = (long int)(*(recv_blob.dptr()+2)); - dshape = TShape{original_size}; + if (compress_ == "2bit") { + long int original_size = (long int)(*(recv_blob.dptr()+2)); + // changing dshape to original size as value is stored in + // original size even when compressed data is received + dshape = TShape{original_size}; + } else { + LOG(FATAL) << "Unsupported 
compression type"; + } + // TODO(huilgolr) check and merge with init of stored if (decomp_buf.is_none()) { decomp_buf = NDArray(dshape, Context()); } } - if(compress_!="none") { -// CHECK_EQ(*((float *) recved.data().dptr_),-0.5); -// CHECK_EQ(*((float *) recved.data().dptr_+1),0.5); -// CHECK_EQ(*((float *) recved.data().dptr_+2),dshape.Size()); -// std::cout<<"recvd "; -// for(int i=0; iResponse(req_meta); stored.WaitToRead(); -// if(compress_!="none") { -// CHECK_EQ(*((float *) recved.data().dptr_+2),dshape.Size()); -// for(int i=0; iInit(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } - comm_->SetCompress(compress_, pos_threshold_, neg_threshold_); + comm_->SetCompress(compress_, neg_threshold_, pos_threshold_); } virtual void PushImpl(const std::vector& keys, diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 4bf462aeaa33..b7128c562836 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -41,7 +41,6 @@ struct init_mem_2bit { } }; - struct TwoBitParam : public dmlc::Parameter { float pos_threshold, neg_threshold; DMLC_DECLARE_PARAMETER(TwoBitParam) { diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index 4697d5b41852..0ad37d2cbab3 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -19,35 +19,47 @@ /*! * \file two_bit_quantize.cc - * \brief + * \brief registers quantize_2bit, dequantize_2bit + * and create_2bit operators with nnvm */ #include "./two_bit_quantize-inl.h" namespace mxnet { namespace op { + DMLC_REGISTER_PARAMETER(TwoBitParam); + NNVM_REGISTER_OP(_contrib_quantize_2bit) -.describe(R"code(Quantize an input tensor using 2-bit compression with residual -array and user-specified threshold. 
+.describe(R"code(Quantize an input tensor into using 2bits for each value using +user-specified thresholds, while storing quantization error in residual array. -The quantized_2bit operator takes 5 arguments as follows: -quantize_2bit(array, residual, out, neg_threshold, pos_threshold)`. The `out` -variable will be the compressed array. Note that, the `out` array can be generated by -invoking `create_2bit(array)`. +The quantize_2bit operator takes 5 arguments and is called as follows: +`quantize_2bit(array, residual, out, neg_threshold, pos_threshold)`. +The operator modifies `residual` and `out` arrays. +The `out`variable will be the quantized array. Note that, `out` array can be generated by +invoking `create_2bit(array)`, avoiding calculation of size of quantized array. +This `out` array has first three elements as negative threshold, positive threshold, +and size of the original uncompressed array. Any elements after these three elements +represent quantized data. +The operation sums up array and residual, and then +applies the thresholds to quantize the data into one of three states +represented by 2bits. 16 such quantized floats in the original array +are packed together into one float in the `out` array. +The quantization error is stored in residual array. For example, assume the input array (gradient) is [5.0, -1.0, -5.0, -4.0], and the -residual is [0.0, -2.0, 0, 1.0], and the threshold is -4.0 and +4.0, respectively. -In this method, the elements whose (gradient + residual) >= pos_threshold will be -compressed into 2-bits '01', and the elements whose -(gradient + residual) <= neg_threshold will be -compressed into 2-bits '10'. The other elements will be compressed -into '00', which is represented as zero. Every 16 floats in the -original array will be packed into one float data in output array. +residual is [0.0, -2.0, 0, 1.0]. Let the negative and positive thresholds be +-4.0 and +4.0, respectively. 
In this method, the elements whose +(gradient + residual) >= pos_threshold will be quantized into 2-bits '01', +and the elements whose (gradient + residual) <= neg_threshold will be +quantized into 2-bits '10'. The other elements will be quantized +as '00'. Every 16 floats in the original array will be packed +into one float variable in the output array. In this example, 'out' has 4 elements. The first element stores the -neg_threshold (-4.0) and the second element stores the pos_threshold (+4.0), the +neg_threshold (-4.0), the second element stores the pos_threshold (+4.0), the third element stores the original size of the uncompressed array, and the -original array will be compressed into a single element in the last element. +original array will be quantized into a single element in the last element. The residual is also updated to [1.0, -3.0, -1.0, -3.0]. )code" ADD_FILELINE) .set_num_inputs(3) @@ -66,33 +78,33 @@ The residual is also updated to [1.0, -3.0, -1.0, -3.0]. .add_argument("quantized_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_arguments(TwoBitParam::__FIELDS__()); - NNVM_REGISTER_OP(_contrib_create_2bit) - .describe(R"code(To generate a compressed array with right shape. + .describe(R"code(Generate an array with the right shape to store the input data after +two bit quantization. This array will be on the same context as input array. )code" ADD_FILELINE) - .set_num_inputs(1) - .set_num_outputs(1) - .set_attr("FInferShape", Create2BitArrayShape) - .set_attr("FInferType", Create2BitArrayType) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FInferShape", Create2BitArrayShape) +.set_attr("FInferType", Create2BitArrayType) .set_attr("FCompute", Create2BitArrayCompute) .set_attr("FGradient", ElemwiseGradUseNone{"_create_2bit"}) .add_argument("input", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); NNVM_REGISTER_OP(_contrib_dequantize_2bit) -.describe(R"code(Dequantize an input tensor compressed by quantize_2bit. 
+.describe(R"code(Dequantize an input tensor quantized by quantize_2bit. The dequantize_2bit operator takes two input arguments. The first input is a NDArray, which has been generated by quantize_2bit(). This operator expects the first three elements to be the negative threshold, positive threshold, and the size -of the original uncompressed array. Starting from the fourth element are -compressed values of the original array. -The second input is also a NDArray that has the same shape as -the original array before compressing. +of the original uncompressed array. Starting from the fourth element are expected to +be quantized values of the original array. +The second input is a NDArray that has the same shape as the original +array before quantizing. The operator replaces the contents of this array +with dequantized data. -Using the example as was described for quantize_2bit - -Invoke dequantize_2bit(out, array), the 'array' argument will become -[4.0, 0, -4.0, 0], where -4.0 is the negative threshold. +In the example was described for quantize_2bit, +invoking dequantize_2bit(out, array), the 'array' argument will become +[4.0, 0, -4.0, 0], where -4.0 and 4.0 are the negative and positive thresholds. )code" ADD_FILELINE) .set_num_inputs(2) .set_num_outputs(0) @@ -106,6 +118,5 @@ Invoke dequantize_2bit(out, array), the 'array' argument will become }) .add_argument("quantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_argument("dequantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); - } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/two_bit_quantize.cu b/src/operator/contrib/two_bit_quantize.cu index 20f21a295ebd..b90ff1840771 100644 --- a/src/operator/contrib/two_bit_quantize.cu +++ b/src/operator/contrib/two_bit_quantize.cu @@ -19,7 +19,8 @@ /*! 
* \file two_bit_quantize_sim.cu - * \brief + * \brief registers quantize_2bit, dequantize_2bit + * and create_2bit operators for GPU */ #include "./two_bit_quantize-inl.h" From eeb454bdd8d8c059e3a96a5bee77c8758543613e Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 18 Oct 2017 04:31:18 -0700 Subject: [PATCH 123/237] comments for tests --- tests/nightly/dist_sync_kvstore.py | 10 +++---- tests/nightly/test_kvstore.py | 5 ++-- tests/python/gpu/test_operator_gpu.py | 2 -- tests/python/unittest/test_operator.py | 41 +------------------------- 4 files changed, 8 insertions(+), 50 deletions(-) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 06fa5016f937..1eebe0028ac9 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -69,7 +69,6 @@ def init_kv_compressed(kv): def test_sync_push_pull(): kv, my_rank, nworker = init_kv() - def check_default_keys(kv, my_rank, nworker): nrepeat = 3 # checks pull after push in loop, because behavior during @@ -224,14 +223,12 @@ def check_compr_ones(kv, pos, nworker): def check_compr_pull_before_push(kv): for k,s in [('21', shape),('2221',irregular_shape),('221', big_shape)]: - print(k,s) val = mx.nd.ones(s) kv.pull(k, val) check_diff_to_scalar(val, 0) def check_compr_zero(kv): for k,s in [('21', shape),('2221',irregular_shape),('221', big_shape)]: - print(k,s) kv.push(k, mx.nd.zeros(s)) # to check that all are set to 0s val = mx.nd.ones(s) @@ -239,6 +236,8 @@ def check_compr_zero(kv): check_diff_to_scalar(val, 0) def check_compr_random(kv, pos, neg, nworker): + # set a seed so all workers generate same data. 
knowing this helps + # calculate expected value after pull mx.random.seed(123) rnd.seed(123) for k,s in [('2221',irregular_shape),('221', big_shape), ('21', shape)]: @@ -252,8 +251,7 @@ def check_compr_random(kv, pos, neg, nworker): kv.pull(k, val) expected_diff = val - orig_val - - # use copy because push modifies grad + compr = mx.contrib.nd.create_2bit(grad_cpy) mx.contrib.ndarray.quantize_2bit(grad_cpy, mx.nd.zeros(s), compr, neg, pos) decompr = mx.nd.zeros(grad.shape) @@ -269,7 +267,7 @@ def check_compr_random(kv, pos, neg, nworker): check_big_row_sparse_keys(kv, my_rank, nworker) print('worker ' + str(my_rank) + ' is done with non compression tests') - # dont run non compressed keys after this as kvstore now is set to compressed + # don't run non compressed keys after this as kvstore now is set to compressed kv, pos, neg = init_kv_compressed(kv) check_compr_pull_before_push(kv) check_compr_zero(kv) diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index 05156b783a61..7a9f40169348 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -45,7 +45,7 @@ def test_kvstore(kv_type): kv.set_optimizer(mx.optimizer.create('test', rescale_grad=lr)) for k, s in zip(keys, shapes): kv.init(k, mx.nd.zeros(s)) - + res = [np.zeros(s) for s in shapes] for i in range(nrepeat): for j in range(len(keys)): @@ -61,7 +61,7 @@ def test_kvstore(kv_type): assert(err < 1e-6), (err, shapes[j]) def test_compress_kvstore(kv_type, compress='2bit', neg=-0.5, pos=0.5): - print(kv_type, compress) + print(kv_type + ' with ' + compress + ' compression') rate = 2 kv = mx.kv.create(kv_type) kv.set_compress({'compress':compress, 'neg_threshold':neg, 'pos_threshold':pos}) @@ -135,6 +135,7 @@ def check_ones(kv, pos, rate, curval): test_kvstore('local_allreduce_cpu') test_kvstore('local_allreduce_device') +# compression for local kvstore happens only when reduce is on device test_compress_kvstore('local_allreduce_device') ## group keys interface diff 
--git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 2e6db46789e1..b1f43f31051d 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -19,7 +19,6 @@ import os import time import unittest -import struct import mxnet as mx import numpy as np import unittest @@ -1447,7 +1446,6 @@ def test_cross_device_autograd(): assert_almost_equal(dx, x.grad.asnumpy()) - if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 9b20e3bd5181..d0f648eb417a 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3536,46 +3536,6 @@ def test_quantization_op(): assert same(qa.asnumpy(), qa_real.asnumpy()) assert same(a_.asnumpy(), a_real.asnumpy()) -def test_two_bit_quantization_op(): - neg_threshold = -4.0 - pos_threshold = 4.0 - - grad = mx.nd.array([1.0, 1.0, 1.0]) - residual = mx.nd.array([0.0, 0.0, 0.0]) - compr = mx.contrib.nd.create_2bit(grad) - mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) - decompr = mx.nd.zeros(grad.shape) - mx.contrib.ndarray.dequantize_2bit(compr, decompr) - exp_residual = np.ones(grad.shape) - exp_grad = np.zeros(grad.shape) - assert same(np.zeros(grad.shape), decompr.asnumpy()), (decompr.asnumpy(), exp_grad) - assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) - - grad = mx.nd.array([3.0, 3.0, 3.0]) - mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) - mx.contrib.ndarray.dequantize_2bit(compr, decompr) - exp_grad = np.ones(grad.shape)*pos_threshold - exp_residual = np.zeros(grad.shape) - assert same(exp_grad, decompr.asnumpy()), (decompr.asnumpy(),exp_grad) - assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) - - grad = mx.nd.array([1.0, 1.0, 1.0]) - mx.contrib.ndarray.quantize_2bit(grad, residual, 
compr, neg_threshold, pos_threshold) - mx.contrib.ndarray.dequantize_2bit(compr, decompr) - exp_grad = np.zeros(grad.shape) - exp_residual = np.ones(grad.shape) - assert same(exp_grad, decompr.asnumpy()), (decompr.asnumpy(), exp_grad) - assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) - - grad = mx.nd.array([6.0, 6.0, 6.0]) - mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) - mx.contrib.ndarray.dequantize_2bit(compr, decompr) - exp_grad = np.ones(grad.shape)*pos_threshold - exp_residual = np.ones(grad.shape)*3 - assert same(exp_grad, decompr.asnumpy()), (decompr.asnumpy(), exp_grad) - assert same(residual.asnumpy(), exp_residual), (residual.asnumpy(), exp_residual) - - def test_reciprocal_op(): data_tmp = np.random.rand(3, 4) * 10 - 5 # Avoid possible division by 0 errors @@ -4728,6 +4688,7 @@ def test_softmax(): check_softmax_grad(default_context()) check_smoothed_softmax_grad(default_context()) + if __name__ == '__main__': import nose nose.runmodule() From 2f936ee62457ca05509274296caf99e963d45959 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 18 Oct 2017 13:35:16 -0700 Subject: [PATCH 124/237] lint changes and comments --- cpp-package/include/mxnet-cpp/kvstore.h | 3 ++- cpp-package/include/mxnet-cpp/kvstore.hpp | 2 +- include/mxnet/ndarray.h | 4 ++-- python/mxnet/gluon/trainer.py | 2 +- python/mxnet/kvstore.py | 11 +++++---- src/kvstore/comm.h | 12 +++++----- src/kvstore/kvstore_dist.h | 26 ++++++++++++--------- src/kvstore/kvstore_dist_server.h | 3 +-- src/kvstore/kvstore_local.h | 2 +- src/ndarray/ndarray.cc | 6 ++--- src/ndarray/ndarray_function.cc | 4 ++-- src/ndarray/ndarray_function.cu | 4 ++-- src/operator/contrib/two_bit_quantize-inl.h | 13 ++++++----- src/operator/contrib/two_bit_quantize.cc | 2 +- 14 files changed, 50 insertions(+), 44 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/kvstore.h b/cpp-package/include/mxnet-cpp/kvstore.h index 45d7bfacacda..f25150b1aa03 
100644 --- a/cpp-package/include/mxnet-cpp/kvstore.h +++ b/cpp-package/include/mxnet-cpp/kvstore.h @@ -37,7 +37,8 @@ class KVStore { public: static void SetType(const std::string& type); static void RunServer(); - static void SetCompress(const std::string& compress, const float neg_threshold, const float pos_threshold); + static void SetCompress(const std::string& compress, + const float neg_threshold, const float pos_threshold); static void Init(int key, const NDArray& val); static void Init(const std::vector& keys, const std::vector& vals); static void Push(int key, const NDArray& val, int priority = 0); diff --git a/cpp-package/include/mxnet-cpp/kvstore.hpp b/cpp-package/include/mxnet-cpp/kvstore.hpp index 038c88745447..b9c4cad86f02 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.hpp +++ b/cpp-package/include/mxnet-cpp/kvstore.hpp @@ -85,7 +85,7 @@ inline void KVStore::RunServer() { inline void KVStore::SetCompress(const std::string& compress, const float neg_threshold, const float pos_threshold) { CHECK_EQ(MXKVStoreSetCompress(get_kvstore()->get_handle(), - compress.c_str(), neg_threshold, pos_threshold),0); + compress.c_str(), neg_threshold, pos_threshold), 0); } inline void KVStore::Init(int key, const NDArray& val) { diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index b26197d1b5a5..9fed78f0854c 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -904,7 +904,7 @@ void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0); * \param pos_threshold positive threshold for 2bit quantization * \param priority Priority of the action. 
*/ -void Quantize(const NDArray &from, NDArray *to, NDArray *residual, std::string& compress, +void Quantize(const NDArray &from, NDArray *to, NDArray *residual, const std::string& compress, const float neg_threshold, const float pos_threshold, int priority); @@ -916,7 +916,7 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, std::string& * \param compress type of compression * \param priority Priority of the action. */ -void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int priority); +void Dequantize(const NDArray &from, NDArray *to, const std::string& compress, int priority); /*! * \brief issue an copy operation from one NDArray to another diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 227370bd38af..d13691723a64 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -74,7 +74,7 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', "First argument must be a list or dict of Parameters, " \ "got list of %s."%(type(param))) self._params.append(param) - if compress_params : + if compress_params: if not isinstance(compress_params, dict): raise ValueError("compress_params needs to be a dictionary") self._compress_params = compress_params if compress_params else {'compress':'none'} diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 7877f9331b9e..4cc9591ef4b3 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -349,7 +349,7 @@ def row_sparse_pull(self, key, out=None, priority=0, row_ids=None): check_call(_LIB.MXKVStorePullRowSparse( self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority))) - def set_compress(self, compress_params={'compress':'none'}): + def set_compress(self, compress_params=None): """ Specifies type of low-bit quantization for gradient compression if any, and additional arguments depending on the type of compression being used. 
@@ -414,11 +414,12 @@ def set_compress(self, compress_params={'compress':'none'}): negative threshold. Negative values greater than negative threshold will be set to 0. """ + compress_params = compress_params if compress_params else {'compress':'none'} if 'compress' not in compress_params: raise ValueError('compress_params requires compress to be set') elif not isinstance(compress_params['compress'], string_types): raise TypeError('compress must be a string') - elif compress_params['compress'] not in ['none','2bit']: + elif compress_params['compress'] not in ['none', '2bit']: raise ValueError('Unsupported type of compression') if compress_params['compress'] == '2bit': @@ -440,9 +441,9 @@ def set_compress(self, compress_params={'compress':'none'}): if compress_params['compress'] == '2bit': check_call(_LIB.MXKVStoreSetCompress(self.handle, - c_str(compress_params['compress']), - mx_float(compress_params['neg_threshold']), - mx_float(compress_params['pos_threshold']))) + c_str(compress_params['compress']), + mx_float(compress_params['neg_threshold']), + mx_float(compress_params['pos_threshold']))) def set_optimizer(self, optimizer): """ Registers an optimizer with the kvstore. 
diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index fc2b55e37aee..417cc7c6e6d9 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -507,7 +507,7 @@ class CommDevice : public Comm { int priority) override { // avoid extra copy for single device, but it may bring problems for // abnormal usage of kvstore - if (src.size() == 1 && compress_=="none") { + if (src.size() == 1 && compress_ == "none") { return src[0]; } @@ -525,7 +525,7 @@ class CommDevice : public Comm { auto& buf = merge_buf_[key]; std::vector reduce(src.size()); - if (compress_=="none"){ + if (compress_ == "none") { CopyFromTo(src[0], &(buf.merged), priority); reduce[0] = buf.merged; @@ -560,7 +560,7 @@ class CommDevice : public Comm { buf.residual[i] = 0; if (compress_ == "2bit") { int bits = 16; - long int small_size = buf.merged.shape().Size() % bits == 0 ? + int64_t small_size = buf.merged.shape().Size() % bits == 0 ? buf.merged.shape().Size() / bits + 3 : buf.merged.shape().Size() / bits + 4; buf.small_recv_buf[i] = NDArray(TShape{small_size}, buf.merged.ctx(), @@ -568,7 +568,7 @@ class CommDevice : public Comm { buf.small_send_buf[i] = NDArray(TShape{small_size}, src[i].ctx(), false, buf.merged.dtype()); } else { - LOG(FATAL) << "Unsupported type of compression "<Slice(cur_to, cur_to + pskv.lens[i]); @@ -532,7 +535,8 @@ class KVStoreDist : public KVStoreLocal { NDArray part_data = flattened_comm_buf.Slice(cur_from, end_part_data); NDArray part_res = res_buf->Slice(cur_from, end_part_data); - Quantize(part_data, &part_compr, &part_res, compress_, neg_threshold_, pos_threshold_, priority); + Quantize(part_data, &part_compr, &part_res, compress_, + neg_threshold_, pos_threshold_, priority); cur_from = end_part_data; cur_to = cur_to + pskv.lens[i]; @@ -545,7 +549,7 @@ class KVStoreDist : public KVStoreLocal { } PSKV& EncodeKey(int key, size_t size, bool is_push) { - if (compress_!="none") { + if (compress_ != "none") { return EncodeCompressedKey(key, size, is_push); } else { return 
EncodeDefaultKey(key, size, is_push); @@ -565,7 +569,7 @@ class KVStoreDist : public KVStoreLocal { if (compress_ == "2bit") { bits = 16; } else { - LOG(FATAL)<<"Unsupported compression type"; + LOG(FATAL) << "Unsupported compression type"; } // represents size of data to be sent diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 044ef21b2c9f..a677806ed47c 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -388,7 +388,7 @@ class KVStoreDistServer { NDArray decomp_buf = decomp_buf_[key]; if (compress_ != "none") { if (compress_ == "2bit") { - long int original_size = (long int)(*(recv_blob.dptr()+2)); + int64_t original_size = (int64_t) (*(recv_blob.dptr()+2)); // changing dshape to original size as value is stored in // original size even when compressed data is received dshape = TShape{original_size}; @@ -504,7 +504,6 @@ class KVStoreDistServer { * by worker by sending a command to server */ std::string compress_ = "none"; - }; } // namespace kvstore diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index da4c8a99af2d..c177cb2eecc5 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -135,7 +135,7 @@ class KVStoreLocal : public KVStore { PullRowSparseImpl(keys, val_rowids, priority); } - virtual void SetCompress(const std::string& compress, const float neg_threshold, + void SetCompress(const std::string& compress, const float neg_threshold, const float pos_threshold) override { compress_ = compress; pos_threshold_ = pos_threshold; diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index f263895032b4..e6559bd091cd 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -558,7 +558,7 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) { } } -void Quantize(const NDArray &from, NDArray *to, NDArray *residual, std::string& compress, +void Quantize(const NDArray &from, NDArray *to, NDArray *residual, const 
std::string& compress, const float neg_threshold, const float pos_threshold, int priority) { CHECK(from.shape().ndim() != 0) @@ -609,7 +609,7 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, std::string& } } -void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int priority) { +void Dequantize(const NDArray &from, NDArray *to, const std::string& compress, int priority) { CHECK(from.shape().ndim() != 0) << "source operands have zero dimension shape"; // important: callback must always capture by value @@ -642,7 +642,7 @@ void Dequantize(const NDArray &from, NDArray *to, std::string& compress, int pri }, from.ctx(), {from.var()}, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); } else { - LOG(FATAL) << "Unsupported dequantization "<(mshadow::Stream* s, */ template<> void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { - mxnet::op::Dequantize2BitImpl(s,inputs); + mxnet::op::Dequantize2BitImpl(s, inputs); } /* @@ -198,7 +198,7 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImpl(s,inputs, neg_threshold, pos_threshold); + mxnet::op::Quantize2BitImpl(s, inputs, neg_threshold, pos_threshold); } } // namespace ndarray diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index 197ac1e7970f..6c2924881a56 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -208,7 +208,7 @@ void ElementwiseSum(mshadow::Stream* s, */ template<> void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { - mxnet::op::Dequantize2BitImpl(s,inputs); + mxnet::op::Dequantize2BitImpl(s, inputs); } /* @@ -217,7 +217,7 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i template<> void Quantize2BitDispatch(mshadow::Stream* s, 
const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImpl(s,inputs, neg_threshold, pos_threshold); + mxnet::op::Quantize2BitImpl(s, inputs, neg_threshold, pos_threshold); } } // namespace ndarray diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index b7128c562836..42640fd30af6 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -109,7 +109,8 @@ struct init_threshold_2bit { // The third element is the original size of the array out[0] = neg_threshold; out[1] = pos_threshold; - out[2] = (float)size; + // TODO(huilgolr) check potential problem here? + out[2] = static_cast(size); } }; @@ -127,18 +128,18 @@ struct quantize_2bit { // start and end are indices in original grad array int start = block_id*16; int end = (start+16 <= grad_size) ? start+16 : grad_size; - char* block_ptr = reinterpret_cast(compr_block); - for (int i=start; i (compr_block); + for (int i=start; i < end; i++){ char* curr_byte = block_ptr + (i-start)/4; float curr_value = grad[i] + residual[i]; if (curr_value >= pos_threshold) { residual[i] = curr_value - pos_threshold; // set data to 10 - (*curr_byte) |= (2u<<(6-((i%4)*2))); + (*curr_byte) |= (2u << (6-((i%4)*2))); } else if (curr_value <= neg_threshold) { residual[i] = curr_value - neg_threshold; // set data to 01 - (*curr_byte) |= (1u<<(6-((i%4)*2))); + (*curr_byte) |= (1u << (6-((i%4)*2))); } else { // leave data as 00 residual[i] = curr_value; @@ -312,7 +313,7 @@ inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, // check input CHECK(!shape_is_none(in_attrs->at(0))); CHECK(!shape_is_none(in_attrs->at(1))); - //TODO(huilgolr) check + // TODO(huilgolr) check CHECK_LE(in_attrs->at(1).Size(), in_attrs->at(0).Size()*16) << "The shape of the second input array are " diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index 
0ad37d2cbab3..6221aa77e871 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -71,7 +71,7 @@ The residual is also updated to [1.0, -3.0, -1.0, -3.0]. .set_attr("FGradient", ElemwiseGradUseNone{"_quantize_2bit"}) .set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { - return std::vector{1,2}; + return std::vector{1, 2}; }) .add_argument("gradient_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_argument("residual_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") From 5e849e10a06b441443ea8329d788e21cc6400bc8 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 18 Oct 2017 22:28:11 +0000 Subject: [PATCH 125/237] speed up operator test by reducing asnumpy() calls --- tests/python/unittest/test_operator.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index d0f648eb417a..74b58548af24 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -4311,8 +4311,8 @@ def check(data, idx): def test_two_bit_quantization(): neg_threshold = -0.5 pos_threshold = 0.5 - orig_shape = [(25,),(16,),(1121),(144000)] - num_repeat = 3 + orig_shape = [(25,),(16,),(1121),(14400)] + num_repeat = 1 from struct import pack,unpack def bits2int(bits): @@ -4330,8 +4330,10 @@ def compute_expected(arr, neg, pos, curr_residual): str_quant = '' new_residual = [] decompr = [] - for i, a in np.ndenumerate(arr.asnumpy()): - a += curr_residual.asnumpy()[i] + arr_npy = arr.asnumpy() + curr_res_npy = curr_residual.asnumpy() + for i, a in np.ndenumerate(arr_npy): + a += curr_res_npy[i] if a >= pos: str_quant += '10' new_residual.append(a - pos) @@ -4363,6 +4365,7 @@ def check(grad, residual): mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) decompr = mx.nd.zeros(grad.shape) 
mx.contrib.ndarray.dequantize_2bit(compr, decompr) + mx.nd.waitall() assert np.array_equal(compr.asnumpy(), np.array(exp_compr)) , (compr, exp_compr) assert np.array_equal(decompr.asnumpy(), np.array(exp_decompr)) , (decompr, exp_decompr) # use almost equal for residual as this involves addition operation @@ -4383,7 +4386,7 @@ def onesdata(shape): def random_data(shape): # push random data and residual grad = mx.nd.random_uniform(-0.9,0.9, shape=shape, ctx=default_context()) - residual = mx.nd.random_uniform(-1,1, shape=shape, ctx=default_context()) + residual = mx.nd.random_uniform(-0.6,0.6, shape=shape, ctx=default_context()) return grad, residual def random_large_range(shape): @@ -4391,11 +4394,6 @@ def random_large_range(shape): residual = mx.nd.random_uniform(-2,2, shape=shape, ctx=default_context()) return grad, residual - def random_small_range(shape): - grad = mx.nd.random_uniform(-0.6,6, shape=shape, ctx=default_context()) - residual = mx.nd.random_uniform(-0.1,0.1, shape=shape, ctx=default_context()) - return grad, residual - for shape in orig_shape: for i in range(num_repeat): data = [] @@ -4403,7 +4401,6 @@ def random_small_range(shape): data.append(onesdata(shape)) data.append(random_data(shape)) data.append(random_large_range(shape)) - data.append(random_small_range(shape)) for d in data: check(d[0], d[1]) From 69608da7389ab6d5ad2bb37d54f86fcbdfeb836f Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 18 Oct 2017 16:47:24 -0700 Subject: [PATCH 126/237] random data for test_kvstore_local --- tests/nightly/dist_sync_kvstore.py | 5 ++-- tests/nightly/test_kvstore.py | 32 ++++++++++++++++++++++++-- tests/python/unittest/test_operator.py | 1 - 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 1eebe0028ac9..074f93da63e2 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -250,15 +250,16 @@ def check_compr_random(kv, pos, 
neg, nworker): val = mx.nd.zeros(s) kv.pull(k, val) - expected_diff = val - orig_val + diff = val - orig_val + # compute expected by directly using operators compr = mx.contrib.nd.create_2bit(grad_cpy) mx.contrib.ndarray.quantize_2bit(grad_cpy, mx.nd.zeros(s), compr, neg, pos) decompr = mx.nd.zeros(grad.shape) mx.contrib.ndarray.dequantize_2bit(compr, decompr) decompr *= nworker * rate - assert_almost_equal(expected_diff, decompr) + assert_almost_equal(diff, decompr) print ('worker '+str(my_rank)+' started') check_default_keys(kv, my_rank, nworker) diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index 7a9f40169348..cf736bf7914c 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -109,27 +109,55 @@ def verify_residual(kv, neg_threshold, pos_threshold, rate): for o in out: check_diff_to_scalar(o, curval) - kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*(pos_threshold-0.2) for g in range(nworker)]) + kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*(pos_threshold-0.3) for g in range(nworker)]) out = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] kv.pull(keys[j],out=out) curval += pos_threshold*rate*nworker for o in out: check_diff_to_scalar(o, curval) + # residual would be 0 now return curval def check_ones(kv, pos, rate, curval): newval = curval + rate*nworker*pos for j in range(len(keys)): - kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*pos*4 for g in range(nworker)]) + kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*pos for g in range(nworker)]) out = [mx.nd.ones(shapes[j], mx.gpu(g)) for g in range(nworker)] kv.pull(keys[j], out=out) for o in out: check_diff_to_scalar(o, newval) + # residual would be 0 again + + def check_compr_random(kv, pos, neg): + for j in range(len(keys)): + orig_val = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] + kv.pull(k, out=orig_val) + + grads = [mx.nd.random_uniform(-0.6,0.6, shape=shapes[j], ctx=mx.gpu(g)) for g in range(nworker)] + 
kv.push(k, grads) + val = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] + kv.pull(k, out=val) + + diff = val - orig_val + + # compute expected by directly using operators + comprs = [] + decomprs = [] + # on cpu + sum_dequantized_vals = np.zeros(shapes[j]) + for g in range(nworker): + comprs.append(mx.contrib.nd.create_2bit(grads[g])) + decomprs.append(mx.nd.zeros(grads[g].shape, ctx=mx.gpu(g))) + mx.contrib.ndarray.quantize_2bit(grads[j], mx.nd.zeros(s, ctx=mx.gpu(g)), comprs[g], neg, pos) + mx.contrib.ndarray.dequantize_2bit(comprs[g], decomprs[g]) + sum_dequantized_vals += decomprs[g].asnumpy() + assert_almost_equal(diff.asnumpy(), sum_dequantized_vals) pull_before_push(kv) push_zeros(kv) curval = verify_residual(kv, neg, pos, rate) check_ones(kv, pos, rate, curval) + check_compr_random(kv, pos, neg) test_kvstore('local_update_cpu') test_kvstore('local_allreduce_cpu') diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 74b58548af24..91826d54efd6 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -4365,7 +4365,6 @@ def check(grad, residual): mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) decompr = mx.nd.zeros(grad.shape) mx.contrib.ndarray.dequantize_2bit(compr, decompr) - mx.nd.waitall() assert np.array_equal(compr.asnumpy(), np.array(exp_compr)) , (compr, exp_compr) assert np.array_equal(decompr.asnumpy(), np.array(exp_decompr)) , (decompr, exp_decompr) # use almost equal for residual as this involves addition operation From 847a7f2e7454dccc69cb688f82b9a53c100dabb7 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 18 Oct 2017 16:59:20 -0700 Subject: [PATCH 127/237] fix variable confusion error in test --- tests/nightly/test_kvstore.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index cf736bf7914c..0d78b36d7e4c 100644 
--- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -131,12 +131,12 @@ def check_ones(kv, pos, rate, curval): def check_compr_random(kv, pos, neg): for j in range(len(keys)): orig_val = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] - kv.pull(k, out=orig_val) + kv.pull(keys[j], out=orig_val) grads = [mx.nd.random_uniform(-0.6,0.6, shape=shapes[j], ctx=mx.gpu(g)) for g in range(nworker)] - kv.push(k, grads) + kv.push(keys[j], grads) val = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] - kv.pull(k, out=val) + kv.pull(keys[j], out=val) diff = val - orig_val @@ -148,7 +148,7 @@ def check_compr_random(kv, pos, neg): for g in range(nworker): comprs.append(mx.contrib.nd.create_2bit(grads[g])) decomprs.append(mx.nd.zeros(grads[g].shape, ctx=mx.gpu(g))) - mx.contrib.ndarray.quantize_2bit(grads[j], mx.nd.zeros(s, ctx=mx.gpu(g)), comprs[g], neg, pos) + mx.contrib.ndarray.quantize_2bit(grads[j], mx.nd.zeros(shapes[j], ctx=mx.gpu(g)), comprs[g], neg, pos) mx.contrib.ndarray.dequantize_2bit(comprs[g], decomprs[g]) sum_dequantized_vals += decomprs[g].asnumpy() assert_almost_equal(diff.asnumpy(), sum_dequantized_vals) From 2f8e86ea08d607b28f0e99d4b5629c131b1c027a Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 19 Oct 2017 00:42:42 +0000 Subject: [PATCH 128/237] fix randomized data test for local kvstore --- tests/nightly/test_kvstore.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index 0d78b36d7e4c..cf8d396e3668 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -132,26 +132,25 @@ def check_compr_random(kv, pos, neg): for j in range(len(keys)): orig_val = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] kv.pull(keys[j], out=orig_val) - grads = [mx.nd.random_uniform(-0.6,0.6, shape=shapes[j], ctx=mx.gpu(g)) for g in range(nworker)] kv.push(keys[j], grads) val = [mx.nd.zeros(shapes[j], 
mx.gpu(g)) for g in range(nworker)] kv.pull(keys[j], out=val) - - diff = val - orig_val + diff = [val[g] - orig_val[g] for g in range(nworker)] # compute expected by directly using operators comprs = [] - decomprs = [] + decomprs = [] # on cpu sum_dequantized_vals = np.zeros(shapes[j]) for g in range(nworker): comprs.append(mx.contrib.nd.create_2bit(grads[g])) decomprs.append(mx.nd.zeros(grads[g].shape, ctx=mx.gpu(g))) - mx.contrib.ndarray.quantize_2bit(grads[j], mx.nd.zeros(shapes[j], ctx=mx.gpu(g)), comprs[g], neg, pos) + mx.contrib.ndarray.quantize_2bit(grads[g], mx.nd.zeros(shapes[j], ctx=mx.gpu(g)), comprs[g], neg, pos) mx.contrib.ndarray.dequantize_2bit(comprs[g], decomprs[g]) - sum_dequantized_vals += decomprs[g].asnumpy() - assert_almost_equal(diff.asnumpy(), sum_dequantized_vals) + sum_dequantized_vals += ((decomprs[g]*rate).asnumpy()) + for g in range(nworker): + assert_almost_equal(diff[g].asnumpy(), sum_dequantized_vals) pull_before_push(kv) push_zeros(kv) From 69af018a8d9c4f0fd6758db8877dacb336507359 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 18 Oct 2017 17:50:24 -0700 Subject: [PATCH 129/237] add nrepeat for test_kvstore --- tests/nightly/test_kvstore.py | 70 ++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index 0d78b36d7e4c..e6f0218be2e7 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -118,45 +118,49 @@ def verify_residual(kv, neg_threshold, pos_threshold, rate): # residual would be 0 now return curval - def check_ones(kv, pos, rate, curval): - newval = curval + rate*nworker*pos - for j in range(len(keys)): - kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*pos for g in range(nworker)]) - out = [mx.nd.ones(shapes[j], mx.gpu(g)) for g in range(nworker)] - kv.pull(keys[j], out=out) - for o in out: - check_diff_to_scalar(o, newval) - # residual would be 0 again + def check_neg(kv, neg, rate, 
curval): + for r in range(nrepeat): + curval = curval + rate*nworker*neg + for j in range(len(keys)): + kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*neg for g in range(nworker)]) + out = [mx.nd.ones(shapes[j], mx.gpu(g)) for g in range(nworker)] + kv.pull(keys[j], out=out) + for o in out: + check_diff_to_scalar(o, curval) + # residual would be 0 again def check_compr_random(kv, pos, neg): - for j in range(len(keys)): - orig_val = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] - kv.pull(keys[j], out=orig_val) - - grads = [mx.nd.random_uniform(-0.6,0.6, shape=shapes[j], ctx=mx.gpu(g)) for g in range(nworker)] - kv.push(keys[j], grads) - val = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] - kv.pull(keys[j], out=val) - - diff = val - orig_val - - # compute expected by directly using operators - comprs = [] - decomprs = [] - # on cpu - sum_dequantized_vals = np.zeros(shapes[j]) - for g in range(nworker): - comprs.append(mx.contrib.nd.create_2bit(grads[g])) - decomprs.append(mx.nd.zeros(grads[g].shape, ctx=mx.gpu(g))) - mx.contrib.ndarray.quantize_2bit(grads[j], mx.nd.zeros(shapes[j], ctx=mx.gpu(g)), comprs[g], neg, pos) - mx.contrib.ndarray.dequantize_2bit(comprs[g], decomprs[g]) - sum_dequantized_vals += decomprs[g].asnumpy() - assert_almost_equal(diff.asnumpy(), sum_dequantized_vals) + for r in range(nrepeat): + for j in range(len(keys)): + orig_val = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] + kv.pull(keys[j], out=orig_val) + + grads = [mx.nd.random_uniform(-0.6,0.6, shape=shapes[j], ctx=mx.gpu(g)) for g in range(nworker)] + kv.push(keys[j], grads) + val = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] + kv.pull(keys[j], out=val) + + diff = [val[g] - orig_val[g] for g in range(nworker)] + + # compute expected by directly using operators + comprs = [] + decomprs = [] + # on cpu + sum_dequantized_vals = np.zeros(shapes[j]) + for g in range(nworker): + comprs.append(mx.contrib.nd.create_2bit(grads[g])) + 
decomprs.append(mx.nd.zeros(grads[g].shape, ctx=mx.gpu(g))) + mx.contrib.ndarray.quantize_2bit(grads[j], mx.nd.zeros(shapes[j], ctx=mx.gpu(g)), comprs[g], neg, pos) + mx.contrib.ndarray.dequantize_2bit(comprs[g], decomprs[g]) + sum_dequantized_vals += decomprs[g].asnumpy() + for g in range(nworker): + assert_almost_equal(diff[g].asnumpy(), sum_dequantized_vals) + # residual is random now pull_before_push(kv) push_zeros(kv) curval = verify_residual(kv, neg, pos, rate) - check_ones(kv, pos, rate, curval) + check_neg(kv, neg, rate, curval) check_compr_random(kv, pos, neg) test_kvstore('local_update_cpu') From 39e2d22527509292b5f9d3bff4ae402e841d4467 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 19 Oct 2017 01:09:00 +0000 Subject: [PATCH 130/237] change keys after merge from master introduced same keys --- tests/nightly/dist_sync_kvstore.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 074f93da63e2..4cd8f22814c3 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -62,9 +62,9 @@ def init_kv_compressed(kv): neg_threshold = -0.5 kv.set_compress({'compress': '2bit', 'pos_threshold': pos_threshold, 'neg_threshold': neg_threshold}) # init kv compression keys - kv.init('221', mx.nd.zeros(big_shape)) - kv.init('2221', mx.nd.zeros(irregular_shape)) - kv.init('21', mx.nd.zeros(shape)) + kv.init('11221', mx.nd.zeros(big_shape)) + kv.init('112221', mx.nd.zeros(irregular_shape)) + kv.init('1121', mx.nd.zeros(shape)) return kv, pos_threshold, neg_threshold def test_sync_push_pull(): @@ -181,7 +181,7 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): check_diff_to_scalar(val, expected, rank=my_rank) def check_compr_residual(kv, pos_threshold, nworker): - for k,s in [('21', shape),('2221',irregular_shape),('221', big_shape)]: + for k,s in [('1121', shape),('112221',irregular_shape),('11221', big_shape)]: # doesn't 
meet threshold kv.push(k, mx.nd.ones(s)*0.4) val=mx.nd.zeros(s) @@ -210,7 +210,7 @@ def check_compr_residual(kv, pos_threshold, nworker): # residual is 0 now def check_compr_ones(kv, pos, nworker): - for k,s in [('21', shape),('2221',irregular_shape),('221', big_shape)]: + for k,s in [('1121', shape),('112221',irregular_shape),('11221', big_shape)]: val = mx.nd.zeros(s) kv.pull(k, val) curval = val[0][0].asnumpy()[0] @@ -222,13 +222,13 @@ def check_compr_ones(kv, pos, nworker): # residual = 0 again def check_compr_pull_before_push(kv): - for k,s in [('21', shape),('2221',irregular_shape),('221', big_shape)]: + for k,s in [('1121', shape),('112221',irregular_shape),('11221', big_shape)]: val = mx.nd.ones(s) kv.pull(k, val) check_diff_to_scalar(val, 0) def check_compr_zero(kv): - for k,s in [('21', shape),('2221',irregular_shape),('221', big_shape)]: + for k,s in [('1121', shape),('112221',irregular_shape),('11221', big_shape)]: kv.push(k, mx.nd.zeros(s)) # to check that all are set to 0s val = mx.nd.ones(s) @@ -240,7 +240,7 @@ def check_compr_random(kv, pos, neg, nworker): # calculate expected value after pull mx.random.seed(123) rnd.seed(123) - for k,s in [('2221',irregular_shape),('221', big_shape), ('21', shape)]: + for k,s in [('112221',irregular_shape),('11221', big_shape), ('1121', shape)]: orig_val = mx.nd.zeros(s) kv.pull(k, orig_val) From bf3ea61a6a3d1c53a0d37d33ed05f1fcc7b924f9 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 19 Oct 2017 01:51:57 +0000 Subject: [PATCH 131/237] correct test which fails because grad changes --- tests/nightly/dist_sync_kvstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 4cd8f22814c3..a88e9922ad4f 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -245,13 +245,13 @@ def check_compr_random(kv, pos, neg, nworker): kv.pull(k, orig_val) grad = mx.nd.array(rnd.rand(s[0], s[1])) + 
# creates a copy because pull changes grad grad_cpy = mx.nd.array(grad) kv.push(k, grad) val = mx.nd.zeros(s) kv.pull(k, val) diff = val - orig_val - # compute expected by directly using operators compr = mx.contrib.nd.create_2bit(grad_cpy) mx.contrib.ndarray.quantize_2bit(grad_cpy, mx.nd.zeros(s), compr, neg, pos) @@ -259,7 +259,7 @@ def check_compr_random(kv, pos, neg, nworker): mx.contrib.ndarray.dequantize_2bit(compr, decompr) decompr *= nworker * rate - assert_almost_equal(diff, decompr) + assert_almost_equal(diff.asnumpy(), decompr.asnumpy()) print ('worker '+str(my_rank)+' started') check_default_keys(kv, my_rank, nworker) From 9c9ae589ecd45d96e1825a3b71fc3c3464802bf7 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Sat, 21 Oct 2017 23:59:31 -0700 Subject: [PATCH 132/237] change to bit ops --- python/mxnet/kvstore.py | 1 - src/kvstore/comm.h | 4 +-- src/operator/contrib/two_bit_quantize-inl.h | 37 ++++++++++++--------- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 4cc9591ef4b3..608797bea46c 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -439,7 +439,6 @@ def set_compress(self, compress_params=None): raise ValueError('pos_threshold needs to be greater than 0, \ and neg_threshold needs to be less than 0') - if compress_params['compress'] == '2bit': check_call(_LIB.MXKVStoreSetCompress(self.handle, c_str(compress_params['compress']), mx_float(compress_params['neg_threshold']), diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 417cc7c6e6d9..3058bdb95709 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -84,8 +84,8 @@ class Comm { * perform reduce with compressed gradients */ inline void SetCompress(const std::string& compress, - const float neg_threshold, - const float pos_threshold) { + const float neg_threshold, + const float pos_threshold) { compress_ = compress; pos_threshold_ = pos_threshold; neg_threshold_ = neg_threshold; diff --git 
a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 42640fd30af6..dd9ed1773d29 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -30,6 +30,7 @@ #include "../elemwise_op_common.h" #include "../mshadow_op.h" #include "../mxnet_op.h" +#include namespace mxnet { namespace op { @@ -126,20 +127,24 @@ struct quantize_2bit { // init to 0 *compr_block = 0; // start and end are indices in original grad array - int start = block_id*16; - int end = (start+16 <= grad_size) ? start+16 : grad_size; + int start = block_id << 4; + int end = (start + 16 <= grad_size) ? start + 16 : grad_size; char* block_ptr = reinterpret_cast < char* > (compr_block); - for (int i=start; i < end; i++){ - char* curr_byte = block_ptr + (i-start)/4; + +// char* curr_byte = block_ptr; + for (int i = start; i < end; i++) { + // // adds 1 when i-start divisible by 4 + char * curr_byte = block_ptr + ((i-start) >> 2); float curr_value = grad[i] + residual[i]; + if (curr_value >= pos_threshold) { residual[i] = curr_value - pos_threshold; // set data to 10 - (*curr_byte) |= (2u << (6-((i%4)*2))); + *curr_byte |= (2u << (6 - ((i & 3) << 1))); } else if (curr_value <= neg_threshold) { residual[i] = curr_value - neg_threshold; // set data to 01 - (*curr_byte) |= (1u << (6-((i%4)*2))); + *curr_byte |= (1u << (6 - ((i & 3) << 1))); } else { // leave data as 00 residual[i] = curr_value; @@ -157,7 +162,7 @@ void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, neg_threshold, pos_threshold, inputs[0].Size()); // Finally, compress the data and calculate new residual - mxnet_op::Kernel::Launch(s, inputs[2].Size()-3, + mxnet_op::Kernel::Launch(s, (inputs[2].Size()-3), inputs[0].Size(), // original grad size inputs[2].dptr()+3, // compressed array inputs[0].dptr(), // input array @@ -223,15 +228,15 @@ struct dequantize_2bit { float *neg_threshold, float *pos_threshold) { // get block ptr - int block_id 
= i / 16; - char* ch_ptr = reinterpret_cast(in+block_id); +// char* ch_ptr = reinterpret_cast(in + (i << 4)); + // get row ptr - int row_id = (i%16)/4; - ch_ptr += row_id; + char* ch_ptr = (reinterpret_cast(in + (i >> 4))) + ((i & 15) >> 2); + // get column id - int col_id = (i%16)%4; +// int col_id = (i & 15) & 3; // Decompress - switch (col_id) { + switch ((i & 15) & 3) { case 0: // positve if (((*ch_ptr) & (0xc0)) == 0x80) { // binary: (10)00 0000 @@ -248,7 +253,7 @@ struct dequantize_2bit { if (((*ch_ptr) & (0x30)) == 0x20) { // binary: 00(10) 0000 out[i] = *pos_threshold; // negative - } else if (((*ch_ptr) & (0x30)) == 0x10) { // binary: 00(01) 0000 + } else if ( ((*ch_ptr) & (0x30)) == 0x10) { // binary: 00(01) 0000 out[i] = *neg_threshold; } else { // 0 out[i] = 0; @@ -256,7 +261,7 @@ struct dequantize_2bit { break; case 2: // positve - if (((*ch_ptr) & (0x0c)) == 0x08) { // binary: 00(10) 0000 + if ( ((*ch_ptr) & (0x0c)) == 0x08) { // binary: 00(10) 0000 out[i] = *pos_threshold; // negative } else if (((*ch_ptr) & (0x0c)) == 0x04) { // binary: 00(01) 0000 @@ -267,7 +272,7 @@ struct dequantize_2bit { break; case 3: // positve - if (((*ch_ptr) & (0x03)) == 0x02) { // binary: 00(10) 0000 + if (((*ch_ptr) & (0x03))== 0x02) { // binary: 00(10) 0000 out[i] = *pos_threshold; // negative } else if (((*ch_ptr) & (0x03)) == 0x01) { // binary: 00(01) 0000 From 5c42ebb1bb290a1b41083a1f5f2f931b79d12378 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 23 Oct 2017 10:53:14 -0700 Subject: [PATCH 133/237] change to bit ops --- cpp-package/include/mxnet-cpp/kvstore.h | 4 +- cpp-package/include/mxnet-cpp/kvstore.hpp | 5 +- src/operator/contrib/two_bit_quantize-inl.h | 63 ++++++++++++++------- 3 files changed, 46 insertions(+), 26 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/kvstore.h b/cpp-package/include/mxnet-cpp/kvstore.h index f25150b1aa03..d8ba533254d6 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.h +++ b/cpp-package/include/mxnet-cpp/kvstore.h @@ 
-37,8 +37,8 @@ class KVStore { public: static void SetType(const std::string& type); static void RunServer(); - static void SetCompress(const std::string& compress, - const float neg_threshold, const float pos_threshold); + static void SetGradientCompression(const std::string& compress_type, + const float threshold); static void Init(int key, const NDArray& val); static void Init(const std::vector& keys, const std::vector& vals); static void Push(int key, const NDArray& val, int priority = 0); diff --git a/cpp-package/include/mxnet-cpp/kvstore.hpp b/cpp-package/include/mxnet-cpp/kvstore.hpp index b9c4cad86f02..a34851cf5591 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.hpp +++ b/cpp-package/include/mxnet-cpp/kvstore.hpp @@ -82,10 +82,9 @@ inline void KVStore::RunServer() { CHECK_EQ(MXKVStoreRunServer(get_kvstore()->get_handle(), &Controller, 0), 0); } -inline void KVStore::SetCompress(const std::string& compress, - const float neg_threshold, const float pos_threshold) { +inline void KVStore::SetGradientCompression(const std::string& compress_type, const float threshold) { CHECK_EQ(MXKVStoreSetCompress(get_kvstore()->get_handle(), - compress.c_str(), neg_threshold, pos_threshold), 0); + compress_type.c_str(), threshold), 0); } inline void KVStore::Init(int key, const NDArray& val) { diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index dd9ed1773d29..2fc203760474 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -35,6 +35,12 @@ namespace mxnet { namespace op { +// branchless +template int sgn(T val) { + return (T(0) < val) - (val < T(0)); +} + + struct init_mem_2bit { // Initialize output array MSHADOW_XINLINE static void Map(int i, float* out) { @@ -131,24 +137,39 @@ struct quantize_2bit { int end = (start + 16 <= grad_size) ? 
start + 16 : grad_size; char* block_ptr = reinterpret_cast < char* > (compr_block); -// char* curr_byte = block_ptr; for (int i = start; i < end; i++) { // // adds 1 when i-start divisible by 4 char * curr_byte = block_ptr + ((i-start) >> 2); - float curr_value = grad[i] + residual[i]; - if (curr_value >= pos_threshold) { - residual[i] = curr_value - pos_threshold; - // set data to 10 - *curr_byte |= (2u << (6 - ((i & 3) << 1))); - } else if (curr_value <= neg_threshold) { - residual[i] = curr_value - neg_threshold; - // set data to 01 - *curr_byte |= (1u << (6 - ((i & 3) << 1))); - } else { - // leave data as 00 - residual[i] = curr_value; +// float curr_value = grad[i] + residual[i]; + residual[i] += grad[i]; + float curr_value = residual[i]; + // for pos, sign = 1 + // for neg, sign = 0 + // for 0, sign is 0 here, but we don't use it + int sign = curr_value > 0 ; + if (sign * curr_value >= pos_threshold ) { + // set 11 for positive + // set 10 for negative + *curr_byte |= ((sign + 2) << (6 - ((i & 3) << 1))); + // for positive, subtract threshold + // for negative, subtract neg threshold = adding positive threshold + residual[i] -= sign * pos_threshold; } + +// if (curr_value >= pos_threshold) { +// residual[i] = curr_value - pos_threshold; +// set data to 10 +// *curr_byte |= (2u << (6 - ((i & 3) << 1))); +// } else if (curr_value <= neg_threshold) { +// residual[i] = curr_value - neg_threshold; +// set data to 01 +// *curr_byte |= (1u << (6 - ((i & 3) << 1))); +// } +// else { + // leave data as 00 +// residual[i] += curr_value; +// } } } }; @@ -239,10 +260,10 @@ struct dequantize_2bit { switch ((i & 15) & 3) { case 0: // positve - if (((*ch_ptr) & (0xc0)) == 0x80) { // binary: (10)00 0000 + if (((*ch_ptr) & (0xc0)) == 0xc0) { // binary: (11)00 0000 out[i] = *pos_threshold; // negative - } else if (((*ch_ptr) & (0xc0)) == 0x40) { // binary: (01)00 0000 + } else if (((*ch_ptr) & (0xc0)) == 0x80) { // binary: (10)00 0000 out[i] = *neg_threshold; } else { // 0 
out[i] = 0; @@ -250,10 +271,10 @@ struct dequantize_2bit { break; case 1: // positve - if (((*ch_ptr) & (0x30)) == 0x20) { // binary: 00(10) 0000 + if (((*ch_ptr) & (0x30)) == 0x30) { // binary: 00(11) 0000 out[i] = *pos_threshold; // negative - } else if ( ((*ch_ptr) & (0x30)) == 0x10) { // binary: 00(01) 0000 + } else if ( ((*ch_ptr) & (0x30)) == 0x20) { // binary: 00(10) 0000 out[i] = *neg_threshold; } else { // 0 out[i] = 0; @@ -261,10 +282,10 @@ struct dequantize_2bit { break; case 2: // positve - if ( ((*ch_ptr) & (0x0c)) == 0x08) { // binary: 00(10) 0000 + if ( ((*ch_ptr) & (0x0c)) == 0x0c) { // binary: 0000 (11)00 out[i] = *pos_threshold; // negative - } else if (((*ch_ptr) & (0x0c)) == 0x04) { // binary: 00(01) 0000 + } else if (((*ch_ptr) & (0x0c)) == 0x08) { // binary: 0000 (10)00 out[i] = *neg_threshold; } else { // 0 out[i] = 0; @@ -272,10 +293,10 @@ struct dequantize_2bit { break; case 3: // positve - if (((*ch_ptr) & (0x03))== 0x02) { // binary: 00(10) 0000 + if (((*ch_ptr) & (0x03))== 0x03) { out[i] = *pos_threshold; // negative - } else if (((*ch_ptr) & (0x03)) == 0x01) { // binary: 00(01) 0000 + } else if (((*ch_ptr) & (0x03)) == 0x02) { out[i] = *neg_threshold; } else { // 0 out[i] = 0; From 49e4ee0eb271ab6919400b5af8463bdad27610ab Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 24 Oct 2017 15:10:49 -0700 Subject: [PATCH 134/237] use bit array and revert sign changes --- src/operator/contrib/two_bit_quantize-inl.h | 26 ++++++++++----------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index dd9ed1773d29..cc5279f82a02 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -130,24 +130,22 @@ struct quantize_2bit { int start = block_id << 4; int end = (start + 16 <= grad_size) ? 
start + 16 : grad_size; char* block_ptr = reinterpret_cast < char* > (compr_block); + const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const int negbits[] = {0x80, 0x10, 0x08, 0x01}; + char* curr_byte = block_ptr; -// char* curr_byte = block_ptr; for (int i = start; i < end; i++) { // // adds 1 when i-start divisible by 4 - char * curr_byte = block_ptr + ((i-start) >> 2); - float curr_value = grad[i] + residual[i]; - - if (curr_value >= pos_threshold) { - residual[i] = curr_value - pos_threshold; - // set data to 10 - *curr_byte |= (2u << (6 - ((i & 3) << 1))); - } else if (curr_value <= neg_threshold) { - residual[i] = curr_value - neg_threshold; + curr_byte += ((i-start) & 3); + residual[i] += grad[i]; + if (residual[i] >= pos_threshold) { + residual[i] -= pos_threshold; + // set data to 11 + *curr_byte |= posbits[(i & 3)]; + } else if (residual[i] <= neg_threshold) { + residual[i] -= neg_threshold; // set data to 01 - *curr_byte |= (1u << (6 - ((i & 3) << 1))); - } else { - // leave data as 00 - residual[i] = curr_value; + *curr_byte |= negbits[(i & 3)]; } } } From f74d317c7685a0961f66bf5802e0c2ed14544d5c Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 24 Oct 2017 15:16:51 -0700 Subject: [PATCH 135/237] correct bits setting to 10 as 2 --- src/operator/contrib/two_bit_quantize-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index d04e6d3615f5..a027001b7714 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -137,7 +137,7 @@ struct quantize_2bit { int end = (start + 16 <= grad_size) ? 
start + 16 : grad_size; char* block_ptr = reinterpret_cast < char* > (compr_block); const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const int negbits[] = {0x80, 0x10, 0x08, 0x01}; + const int negbits[] = {0x80, 0x20, 0x08, 0x02}; char* curr_byte = block_ptr; for (int i = start; i < end; i++) { @@ -150,7 +150,7 @@ struct quantize_2bit { *curr_byte |= posbits[(i & 3)]; } else if (residual[i] <= neg_threshold) { residual[i] -= neg_threshold; - // set data to 01 + // set data to 10 *curr_byte |= negbits[(i & 3)]; } } From b67a392069ef08795d9dde82b156c4051e1dd73f Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 24 Oct 2017 15:23:59 -0700 Subject: [PATCH 136/237] remove switch in dequantize --- src/operator/contrib/two_bit_quantize-inl.h | 118 +++++++++++--------- 1 file changed, 64 insertions(+), 54 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index a027001b7714..dcbb7f8ac19b 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -231,64 +231,74 @@ struct dequantize_2bit { float *in, float *neg_threshold, float *pos_threshold) { - // get block ptr -// char* ch_ptr = reinterpret_cast(in + (i << 4)); - // get row ptr char* ch_ptr = (reinterpret_cast(in + (i >> 4))) + ((i & 15) >> 2); - // get column id -// int col_id = (i & 15) & 3; - // Decompress - switch ((i & 15) & 3) { - case 0: - // positve - if (((*ch_ptr) & (0xc0)) == 0xc0) { // binary: (11)00 0000 - out[i] = *pos_threshold; - // negative - } else if (((*ch_ptr) & (0xc0)) == 0x80) { // binary: (10)00 0000 - out[i] = *neg_threshold; - } else { // 0 - out[i] = 0; - } - break; - case 1: - // positve - if (((*ch_ptr) & (0x30)) == 0x30) { // binary: 00(11) 0000 - out[i] = *pos_threshold; - // negative - } else if ( ((*ch_ptr) & (0x30)) == 0x20) { // binary: 00(10) 0000 - out[i] = *neg_threshold; - } else { // 0 - out[i] = 0; - } - break; - case 2: - // positve - if ( ((*ch_ptr) & 
(0x0c)) == 0x0c) { // binary: 0000 (11)00 - out[i] = *pos_threshold; - // negative - } else if (((*ch_ptr) & (0x0c)) == 0x08) { // binary: 0000 (10)00 - out[i] = *neg_threshold; - } else { // 0 - out[i] = 0; - } - break; - case 3: - // positve - if (((*ch_ptr) & (0x03))== 0x03) { - out[i] = *pos_threshold; - // negative - } else if (((*ch_ptr) & (0x03)) == 0x02) { - out[i] = *neg_threshold; - } else { // 0 - out[i] = 0; - } - break; - default: - break; + const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const int negbits[] = {0x80, 0x20, 0x08, 0x02}; + + int col = (i & 15) & 3; + if ( ((*ch_ptr) & posbits[col]) == posbits[col] ) { + out[i] = *pos_threshold; + } // use posbits for mask as posbits are 11 + // compare with negbits + else if ( ((*ch_ptr) & posbits[col]) == negbits[col] ) { + out[i] = *neg_threshold; + } else { + out[i] = 0; } - } +// +// // get column id +// // Decompress +// switch ((i & 15) & 3) { +// case 0: +// // positve +// if (((*ch_ptr) & (0xc0)) == 0xc0) { // binary: (11)00 0000 +// out[i] = *pos_threshold; +// // negative +// } else if (((*ch_ptr) & (0xc0)) == 0x80) { // binary: (10)00 0000 +// out[i] = *neg_threshold; +// } else { // 0 +// out[i] = 0; +// } +// break; +// case 1: +// // positve +// if (((*ch_ptr) & (0x30)) == 0x30) { // binary: 00(11) 0000 +// out[i] = *pos_threshold; +// // negative +// } else if ( ((*ch_ptr) & (0x30)) == 0x20) { // binary: 00(10) 0000 +// out[i] = *neg_threshold; +// } else { // 0 +// out[i] = 0; +// } +// break; +// case 2: +// // positve +// if ( ((*ch_ptr) & (0x0c)) == 0x0c) { // binary: 0000 (11)00 +// out[i] = *pos_threshold; +// // negative +// } else if (((*ch_ptr) & (0x0c)) == 0x08) { // binary: 0000 (10)00 +// out[i] = *neg_threshold; +// } else { // 0 +// out[i] = 0; +// } +// break; +// case 3: +// // positve +// if (((*ch_ptr) & (0x03))== 0x03) { +// out[i] = *pos_threshold; +// // negative +// } else if (((*ch_ptr) & (0x03)) == 0x02) { +// out[i] = *neg_threshold; +// } else { // 0 +// out[i] 
= 0; +// } +// break; +// default: +// break; +// } + } }; template From 804f7d1fcf2cc10da893afc769262c3cb5ec490f Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 24 Oct 2017 23:18:56 +0000 Subject: [PATCH 137/237] image classification example changes and remove cpp-api --- cpp-package/include/mxnet-cpp/kvstore.h | 2 -- cpp-package/include/mxnet-cpp/kvstore.hpp | 5 ----- example/image-classification/common/fit.py | 2 +- example/image-classification/symbols/mlp.py | 10 ++++++---- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/kvstore.h b/cpp-package/include/mxnet-cpp/kvstore.h index d8ba533254d6..9c3c81f37ff7 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.h +++ b/cpp-package/include/mxnet-cpp/kvstore.h @@ -37,8 +37,6 @@ class KVStore { public: static void SetType(const std::string& type); static void RunServer(); - static void SetGradientCompression(const std::string& compress_type, - const float threshold); static void Init(int key, const NDArray& val); static void Init(const std::vector& keys, const std::vector& vals); static void Push(int key, const NDArray& val, int priority = 0); diff --git a/cpp-package/include/mxnet-cpp/kvstore.hpp b/cpp-package/include/mxnet-cpp/kvstore.hpp index a34851cf5591..f2b5e74990ce 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.hpp +++ b/cpp-package/include/mxnet-cpp/kvstore.hpp @@ -82,11 +82,6 @@ inline void KVStore::RunServer() { CHECK_EQ(MXKVStoreRunServer(get_kvstore()->get_handle(), &Controller, 0), 0); } -inline void KVStore::SetGradientCompression(const std::string& compress_type, const float threshold) { - CHECK_EQ(MXKVStoreSetCompress(get_kvstore()->get_handle(), - compress_type.c_str(), threshold), 0); -} - inline void KVStore::Init(int key, const NDArray& val) { NDArrayHandle val_handle = val.GetHandle(); CHECK_EQ(MXKVStoreInit(get_kvstore()->get_handle(), 1, &key, &val_handle), 0); diff --git a/example/image-classification/common/fit.py 
b/example/image-classification/common/fit.py index 51a1abec7c48..99453c3094ae 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -114,7 +114,7 @@ def fit(args, network, data_loader, **kwargs): """ # kvstore kv = mx.kvstore.create(args.kv_store) - + kv.set_compress({'compress':'2bit', 'pos_threshold':4.0, 'neg_threshold':-4.0}) # logging head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) diff --git a/example/image-classification/symbols/mlp.py b/example/image-classification/symbols/mlp.py index 4b190b29db9e..0aaa38c44a32 100644 --- a/example/image-classification/symbols/mlp.py +++ b/example/image-classification/symbols/mlp.py @@ -23,10 +23,12 @@ def get_symbol(num_classes=10, **kwargs): data = mx.symbol.Variable('data') data = mx.sym.Flatten(data=data) - fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) + fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=1500) act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) + fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 1500) act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") - fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes) - mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') + fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=1500) + act3 = mx.symbol.Activation(data = fc3, name='relu3', act_type="relu") + fc4 = mx.symbol.FullyConnected(data = act3, name='fc4', num_hidden=3000) + mlp = mx.symbol.SoftmaxOutput(data = fc4, name = 'softmax') return mlp From 96294107ddf3b4253bb4e348f480af73d60e1077 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 25 Oct 2017 16:03:16 -0700 Subject: [PATCH 138/237] merge all quantize, and new type in dist server --- 
include/mxnet/kvstore.h | 4 +- include/mxnet/ndarray.h | 4 +- src/kvstore/comm.h | 7 +- src/kvstore/kvstore_dist.h | 180 +++++++++++++------- src/kvstore/kvstore_dist_server.h | 109 ++++++++---- src/ndarray/ndarray.cc | 129 +++++++------- src/ndarray/ndarray_function.cc | 9 +- src/ndarray/ndarray_function.cu | 8 +- src/ndarray/ndarray_function.h | 4 +- src/operator/contrib/two_bit_quantize-inl.h | 147 +++++++--------- tests/nightly/dist_sync_kvstore.py | 2 +- 11 files changed, 340 insertions(+), 263 deletions(-) diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index 9d3124a83d6a..75fd49c19bb2 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -405,12 +405,12 @@ class KVStore { /** * \brief positive threshold for 2bit compression */ - float pos_threshold_ = 0.1; + float pos_threshold_ = 0.5; /** * \brief negative threshold for 2bit compression */ - float neg_threshold_ = -0.1; + float neg_threshold_ = -0.5; /** * \brief whether to do barrier when finalize diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 9fed78f0854c..7e0a4663ed56 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -34,6 +34,7 @@ #include #include #include +#include "ps/ps.h" #include "./base.h" #include "./storage.h" #include "./engine.h" @@ -916,7 +917,8 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, const std::st * \param compress type of compression * \param priority Priority of the action. */ -void Dequantize(const NDArray &from, NDArray *to, const std::string& compress, int priority); +void Dequantize(const NDArray &from, NDArray *to, int original_size, + const float neg_threshold, const float pos_threshold, const std::string& compress, int priority); /*! 
* \brief issue an copy operation from one NDArray to another diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 3058bdb95709..aa40c59c9125 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -578,15 +578,16 @@ class CommDevice : public Comm { // this is done even if the data is on same context as copy_buf because // we don't want the training to be biased towards data on this GPU if (compress_ == "2bit") { - Quantize(src[i], &(buf.small_send_buf[i]), &(buf.residual[i]), compress_, - neg_threshold_, pos_threshold_, priority); +// Quantize(src[i], &(buf.small_send_buf[i]), &(buf.residual[i]), compress_, +// neg_threshold_, pos_threshold_, priority); if (buf.small_send_buf[i].ctx() != buf.small_recv_buf[i].ctx()) { CopyFromTo(buf.small_send_buf[i], &(buf.small_recv_buf[i]), priority); } else { // avoid memory copy when they are on same context buf.small_recv_buf[i] = buf.small_send_buf[i]; } - Dequantize(buf.small_recv_buf[i], &(buf.copy_buf[i]), compress_, priority); + // TODO (undo comment) +// Dequantize(buf.small_recv_buf[i], &(buf.copy_buf[i]), compress_, priority); } else { LOG(FATAL) << "Unsupported type of compression " << compress_; } diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 8c3b069f05bf..749d3a4d4c91 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -302,6 +302,7 @@ class KVStoreDist : public KVStoreLocal { auto &small_buf = compr_buf_[key]; auto &res_buf = residual_[key]; size_t original_size = comm_buf.shape().Size(); + // returns push_pskv PSKV &pskv = EncodeCompressedKey(key, original_size, true); // Init the small buffer and residual_ buffer for quantize if (small_buf.is_none()) { @@ -318,7 +319,13 @@ class KVStoreDist : public KVStoreLocal { } if (compress_ == "2bit") { - Compress(comm_buf, &small_buf, &res_buf, pskv, priority); + mu_.lock(); + PSKV& pull_pskv = pull_ps_kv_[key]; + mu_.unlock(); + +// Compress(comm_buf, &small_buf, &res_buf, pskv, priority); + 
QuantizeAll(comm_buf, &small_buf, &res_buf, pskv.lens, pull_pskv.lens, + compress_, neg_threshold_, pos_threshold_, priority); } else { LOG(FATAL) << "Unsupported quantization"; } @@ -493,7 +500,7 @@ class KVStoreDist : public KVStoreLocal { // do push. false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, kDefaultPushPull, [cb]() { cb(); }); + pskv.keys, vals, pskv.lens, kCompressedPushPull, [cb]() { cb(); }); }; // acquire locks on both comm_buf and small_buf so that pull (which uses comm_buf) // for the same key waits till push finishes @@ -507,47 +514,95 @@ class KVStoreDist : public KVStoreLocal { PROFILER_MESSAGE("KVStoreDistCompressedPush")); } - /* - * \brief Compresses data by dividing original data into a part for each server, then - * quantizing each of these data blocks. The sizes of these parts come from pskv. - */ - void Compress(const NDArray& comm_buf, NDArray* small_buf, NDArray* res_buf, - const PSKV& pskv, int priority) { - size_t orig_size = comm_buf.shape().Size(); - // to allow indexing parts for each server - NDArray flattened_comm_buf = comm_buf.Reshape(TShape{(int64_t) orig_size}); - - if (compress_ == "2bit") { - // should be start of data in original commbuf - size_t cur_from = 0; - // should be start of meta in new small_buf - size_t cur_to = 0; - for (size_t i = 0; i < pskv.keys.size(); i++) { - NDArray part_compr = small_buf->Slice(cur_to, cur_to + pskv.lens[i]); - - // removing the 3 values from pskv length which are meta data - // end_part_data represents end of original data for this part - size_t end_part_data = cur_from + (pskv.lens[i] - 3) * 16; - // don't exceed original size - if (end_part_data > orig_size) { - end_part_data = orig_size; - } - NDArray part_data = flattened_comm_buf.Slice(cur_from, end_part_data); - NDArray part_res = res_buf->Slice(cur_from, end_part_data); - - Quantize(part_data, &part_compr, &part_res, compress_, - neg_threshold_, 
pos_threshold_, priority); - - cur_from = end_part_data; - cur_to = cur_to + pskv.lens[i]; +// /* +// * \brief Compresses data by dividing original data into a part for each server, then +// * quantizing each of these data blocks. The sizes of these parts come from pskv. +// */ +// void Compress(const NDArray& comm_buf, NDArray* small_buf, NDArray* res_buf, +// const PSKV& pskv, int priority) { +// size_t orig_size = comm_buf.shape().Size(); +// // to allow indexing parts for each server +// NDArray flattened_comm_buf = comm_buf.Reshape(TShape{(int64_t) orig_size}); +// +// if (compress_ == "2bit") { +// // should be start of data in original commbuf +// size_t cur_from = 0; +// // should be start of meta in new small_buf +// size_t cur_to = 0; +// for (size_t i = 0; i < pskv.keys.size(); i++) { +// NDArray part_compr = small_buf->Slice(cur_to, cur_to + pskv.lens[i]); +// +// // removing the 3 values from pskv length which are meta data +// // end_part_data represents end of original data for this part +// size_t end_part_data = cur_from + (pskv.lens[i] - 3) * 16; +// // don't exceed original size +// if (end_part_data > orig_size) { +// end_part_data = orig_size; +// } +// NDArray part_data = flattened_comm_buf.Slice(cur_from, end_part_data); +// NDArray part_res = res_buf->Slice(cur_from, end_part_data); +// +// Quantize(part_data, &part_compr, &part_res, compress_, +// neg_threshold_, pos_threshold_, priority); +// +// cur_from = end_part_data; +// cur_to = cur_to + pskv.lens[i]; +// } +// CHECK_EQ(cur_from, orig_size); +// CHECK_EQ(cur_to, small_buf->shape().Size()); +// } else { +// LOG(FATAL) << "Unsupported compression type"; +// } +// } + + void QuantizeAll(const NDArray &from, NDArray *to, NDArray *residual, + ps::SArray push_pskv_lens, ps::SArray pull_pskv_lens, + const std::string& compress, const float neg_threshold, const float pos_threshold, + int priority) { + CHECK(from.shape().ndim() != 0) + << "source operands have zero dimension shape"; + // 
important: callback must always capture by value + int a = from.ctx().dev_mask(); + int b = to->ctx().dev_mask(); + if (a == cpu::kDevMask && b == cpu::kDevMask) { + if (compress == "2bit") { + Engine::Get()->PushSync([from, to, residual, push_pskv_lens, pull_pskv_lens, neg_threshold, pos_threshold](RunContext ctx) { + std::vector inputs = {from.data(), residual->data(), to->data()}; + mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, + push_pskv_lens, pull_pskv_lens, + neg_threshold, pos_threshold); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); + } else { + LOG(FATAL) << "Unsupported Quantization"; } - CHECK_EQ(cur_from, orig_size); - CHECK_EQ(cur_to, small_buf->shape().Size()); } else { - LOG(FATAL) << "Unsupported compression type"; +#if MXNET_USE_CUDA + if (a == gpu::kDevMask && b == gpu::kDevMask) { + if (compress == "2bit") { + Engine::Get()->PushSync([from, to, residual, pull_pskv_lens, push_pskv_lens, neg_threshold, pos_threshold](RunContext ctx) { + std::vector inputs = {from.data(), residual->data(), to->data()}; + mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, + push_pskv_lens, pull_pskv_lens, + neg_threshold, pos_threshold); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); + } else { + LOG(FATAL) << "Unsupported Quantization"; + } + } else { + LOG(FATAL) << "unknown device mask"; + } +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif } } + + PSKV& EncodeKey(int key, size_t size, bool is_push) { if (compress_ != "none") { return EncodeCompressedKey(key, size, is_push); @@ -573,19 +628,8 @@ class KVStoreDist : public KVStoreLocal { } // represents size of data to be sent - size_t compr_size = 0; - - // adds 3 values to both cases for meta info - if (original_size >= bigarray_bound_) { - // if size of 
data is not divisible by bits, then we need an extra float - // to store the last few values - compr_size = num_servers * ((original_size/num_servers) % bits == 0 ? - (original_size/num_servers)/bits + 3 : - (original_size/num_servers)/bits + 4); - } else { - compr_size = original_size % bits == 0 ? - original_size / bits + 3: original_size / bits + 4; - } + size_t compr_size = original_size % bits == 0 ? + original_size / bits: original_size / bits + 1; mu_.lock(); PSKV& pskv = (is_push) ? push_ps_kv_[key] : pull_ps_kv_[key]; @@ -609,6 +653,10 @@ class KVStoreDist : public KVStoreLocal { int server = (key * 9973) % num_servers; ps::Key ps_key = krs[server].begin() + key; CHECK_LT(ps_key, krs[server].end()); + // meta info + push_pskv.keys.push_back(krs[server].begin() + original_size); + push_pskv.lens.push_back(0); + // data push_pskv.keys.push_back(ps_key); pull_pskv.keys.push_back(ps_key); push_pskv.lens.push_back(compr_size); @@ -619,32 +667,40 @@ class KVStoreDist : public KVStoreLocal { // partition it to all servers push_pskv.size = 0; pull_pskv.size = 0; + for (int i = 0; i < num_servers; ++i) { - size_t part_orig = - static_cast (round(static_cast(original_size)/num_servers*(i+1))) - - static_cast (round(static_cast(original_size)/num_servers*(i))); - // if block was rounded up to beyond size of our data, set it to end of data - if (part_orig + pskv.size > original_size) { - part_orig = original_size - pskv.size; + size_t part_compr, part_orig; + if(i==num_servers-1){ + part_compr = compr_size - push_pskv.size; + part_orig = original_size - pull_pskv.size; + } else { + part_compr = static_cast (round(static_cast(compr_size)/num_servers*(i+1))) - + static_cast (round(static_cast(compr_size)/num_servers*(i))); + part_orig = part_compr * bits; } - // TODO(huilgolr) specific to 2bit compression. generalize - size_t compr_split = (part_orig % bits == 0)? 
part_orig/bits + 3 : part_orig/bits + 4; + // meta info + ps::Key ps_key_dummy = krs[i].begin() + part_orig; + CHECK_LT(ps_key_dummy, krs[i].end()); + push_pskv.keys.push_back(ps_key_dummy); + push_pskv.lens.push_back(0); + + // data ps::Key ps_key = krs[i].begin() + key; CHECK_LT(ps_key, krs[i].end()); push_pskv.keys.push_back(ps_key); pull_pskv.keys.push_back(ps_key); // push_pskv stores lengths of compressed blocks - push_pskv.lens.push_back(compr_split); + push_pskv.lens.push_back(part_compr); // pull_pskv stores lengths of original data pull_pskv.lens.push_back(part_orig); - push_pskv.size += compr_split; + push_pskv.size += part_compr; pull_pskv.size += part_orig; } CHECK_EQ(static_cast(push_pskv.size), compr_size); CHECK_EQ(static_cast(pull_pskv.size), original_size); - CHECK_EQ(push_pskv.lens.size(), num_servers); + CHECK_EQ(push_pskv.lens.size(), num_servers+1); } } return pskv; diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index a677806ed47c..ad2f82175bfa 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -42,6 +42,7 @@ namespace kvstore { static const int kRowSparsePushPull = 1; static const int kDefaultPushPull = 0; +static const int kCompressedPushPull = 3; static const int kStopServer = -1; static const int kSyncMode = -2; static const int kSetCompress = 2; @@ -170,6 +171,8 @@ class KVStoreDistServer { ps::KVServer* server) { if (req_meta.cmd == kRowSparsePushPull) { DataHandleRowSparse(req_meta, req_data, server); + } else if (req_meta.cmd == kCompressedPushPull) { + DataHandleCompressed(req_meta, req_data, server); } else { DataHandleDefault(req_meta, req_data, server); } @@ -362,6 +365,72 @@ class KVStoreDistServer { } } + void DataHandleCompressed(const ps::KVMeta& req_meta, + const ps::KVPairs &req_data, + ps::KVServer* server) { + CHECK_EQ(req_meta.cmd, kCompressedPushPull); + // do some check + if (req_meta.push) { + // there used several WaitToRead, this is because \a 
recved's memory + // could be deallocated when this function returns. so we need to make sure + // the operators with \a NDArray are actually finished + CHECK_EQ(req_data.lens.size(), (size_t)2); + CHECK_EQ(req_data.vals.size(), (size_t)req_data.lens[1]); + + int original_size = DecodeKey(req_data.keys[0]); + int key = DecodeKey(req_data.keys[1]); + auto& stored = store_[key]; + + size_t ds[] = {(size_t)req_data.lens[1]}; + TShape dshape(ds, ds + 1); + TBlob recv_blob((real_t*)req_data.vals.data(), // NOLINT(*) + dshape, cpu::kDevMask); + NDArray recved = NDArray(recv_blob, 0); + + NDArray decomp_buf = decomp_buf_[key]; + dshape = TShape{(int64_t) original_size}; + + // TODO(huilgolr) check and merge with init of stored + if (decomp_buf.is_none()) { + decomp_buf = NDArray(dshape, Context()); + } + + if (stored.is_none()) { + // initialization + stored = NDArray(dshape, Context()); + Dequantize(recved, &stored, original_size, neg_threshold, pos_threshold, compress_, 0); + server->Response(req_meta); + stored.WaitToRead(); + } else if (sync_mode_) { + // synced push + auto& merged = merge_buf_[key]; + if (merged.array.is_none()) { + merged.array = NDArray(dshape, Context()); + } + if (merged.request.size() == 0) { + Dequantize(recved, &merged.array, original_size, neg_threshold, pos_threshold, compress_, 0); + } else { + Dequantize(recved, &decomp_buf, original_size, neg_threshold, pos_threshold, compress_, 0); + merged.array += decomp_buf; + } + merged.request.push_back(req_meta); + ApplyUpdates(key, &merged, &stored, server); + } else { + // async push + Dequantize(recved, &decomp_buf, original_size, neg_threshold, pos_threshold, compress_, 0); + exec_.Exec([this, key, &decomp_buf, &stored]() { + CHECK(updater_); + updater_(key, decomp_buf, &stored); + }); + server->Response(req_meta); + stored.WaitToRead(); + } + } else { // pull + // never used + DataHandleDefault(req_meta, req_data, server); + } + } + void DataHandleDefault(const ps::KVMeta& req_meta, const 
ps::KVPairs &req_data, ps::KVServer* server) { @@ -385,29 +454,10 @@ class KVStoreDistServer { TBlob recv_blob((real_t*)req_data.vals.data(), // NOLINT(*) dshape, cpu::kDevMask); NDArray recved = NDArray(recv_blob, 0); - NDArray decomp_buf = decomp_buf_[key]; - if (compress_ != "none") { - if (compress_ == "2bit") { - int64_t original_size = (int64_t) (*(recv_blob.dptr()+2)); - // changing dshape to original size as value is stored in - // original size even when compressed data is received - dshape = TShape{original_size}; - } else { - LOG(FATAL) << "Unsupported compression type"; - } - // TODO(huilgolr) check and merge with init of stored - if (decomp_buf.is_none()) { - decomp_buf = NDArray(dshape, Context()); - } - } if (stored.is_none()) { // initialization stored = NDArray(dshape, Context()); - if (compress_ == "none") { CopyFromTo(recved, &stored, 0); - } else { - Dequantize(recved, &stored, compress_, 0); - } server->Response(req_meta); stored.WaitToRead(); } else if (sync_mode_) { @@ -417,35 +467,18 @@ class KVStoreDistServer { merged.array = NDArray(dshape, Context()); } if (merged.request.size() == 0) { - if (compress_ == "none") { CopyFromTo(recved, &merged.array, 0); - } else { - Dequantize(recved, &merged.array, compress_, 0); - } } else { - if (compress_ == "none") { merged.array += recved; - } else { - Dequantize(recved, &decomp_buf, compress_, 0); - merged.array += decomp_buf; - } } merged.request.push_back(req_meta); ApplyUpdates(key, &merged, &stored, server); } else { // async push - if (compress_ == "none") { exec_.Exec([this, key, &recved, &stored]() { CHECK(updater_); updater_(key, recved, &stored); }); - } else { - Dequantize(recved, &decomp_buf, compress_, 0); - exec_.Exec([this, key, &decomp_buf, &stored]() { - CHECK(updater_); - updater_(key, decomp_buf, &stored); - }); - } server->Response(req_meta); stored.WaitToRead(); } @@ -467,6 +500,7 @@ class KVStoreDistServer { return key - kr.begin(); } + /** * \brief user defined mode for push */ 
@@ -504,6 +538,9 @@ class KVStoreDistServer { * by worker by sending a command to server */ std::string compress_ = "none"; + + float pos_threshold = 0.5; + float neg_threshold = -0.5; }; } // namespace kvstore diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index e6559bd091cd..466b9690caa4 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -557,73 +557,70 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) { #endif } } - -void Quantize(const NDArray &from, NDArray *to, NDArray *residual, const std::string& compress, - const float neg_threshold, const float pos_threshold, - int priority) { - CHECK(from.shape().ndim() != 0) - << "source operands have zero dimension shape"; - // important: callback must always capture by value - NDArray ret = *to; - NDArray res = *residual; - int a = from.ctx().dev_mask(); - int b = to->ctx().dev_mask(); - if (a == cpu::kDevMask && b == cpu::kDevMask) { - if (compress == "2bit") { - Engine::Get()->PushSync([from, res, ret, neg_threshold, pos_threshold](RunContext ctx) { - std::vector inputs(3); - inputs[0] = from.data(); - inputs[1] = res.data(); - inputs[2] = ret.data(); - mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, - neg_threshold, pos_threshold); - }, from.ctx(), {from.var()}, {ret.var(), res.var()}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); - } else { - LOG(FATAL) << "Unsupported Quantization"; - } - } else { -#if MXNET_USE_CUDA - if (a == gpu::kDevMask && b == gpu::kDevMask) { - if (compress == "2bit") { - Engine::Get()->PushSync([from, res, ret, neg_threshold, pos_threshold](RunContext ctx) { - std::vector inputs(3); - inputs[0] = from.data(); - inputs[1] = res.data(); - inputs[2] = ret.data(); - mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, - neg_threshold, pos_threshold); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, from.ctx(), {from.var()}, {ret.var(), res.var()}, - FnProperty::kNormal, 
priority, PROFILER_MESSAGE("QuantizeGPU")); - } else { - LOG(FATAL) << "Unsupported Quantization"; - } - } else { - LOG(FATAL) << "unknown device mask"; - } -#else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; -#endif - } -} - -void Dequantize(const NDArray &from, NDArray *to, const std::string& compress, int priority) { +// +//void Quantize(const NDArray &from, NDArray *to, NDArray *residual, const std::string& compress, +// const float neg_threshold, const float pos_threshold, +// int priority) { +// CHECK(from.shape().ndim() != 0) +// << "source operands have zero dimension shape"; +// // important: callback must always capture by value +// NDArray ret = *to; +// NDArray res = *residual; +// int a = from.ctx().dev_mask(); +// int b = to->ctx().dev_mask(); +// if (a == cpu::kDevMask && b == cpu::kDevMask) { +// if (compress == "2bit") { +// Engine::Get()->PushSync([from, res, ret, neg_threshold, pos_threshold](RunContext ctx) { +// std::vector inputs(3); +// inputs[0] = from.data(); +// inputs[1] = res.data(); +// inputs[2] = ret.data(); +// mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, +// neg_threshold, pos_threshold); +// }, from.ctx(), {from.var()}, {ret.var(), res.var()}, +// FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); +// } else { +// LOG(FATAL) << "Unsupported Quantization"; +// } +// } else { +//#if MXNET_USE_CUDA +// if (a == gpu::kDevMask && b == gpu::kDevMask) { +// if (compress == "2bit") { +// Engine::Get()->PushSync([from, res, ret, neg_threshold, pos_threshold](RunContext ctx) { +// std::vector inputs(3); +// inputs[0] = from.data(); +// inputs[1] = res.data(); +// inputs[2] = ret.data(); +// mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, +// neg_threshold, pos_threshold); +// // Wait GPU kernel to complete +// ctx.get_stream()->Wait(); +// }, from.ctx(), {from.var()}, {ret.var(), res.var()}, +// FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); +// } else { +// LOG(FATAL) << 
"Unsupported Quantization"; +// } +// } else { +// LOG(FATAL) << "unknown device mask"; +// } +//#else +// LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +//#endif +// } +// } + +void Dequantize(const NDArray &from, NDArray *to, int original_size, + const float neg_threshold, const float pos_threshold, const std::string& compress, int priority) { CHECK(from.shape().ndim() != 0) << "source operands have zero dimension shape"; - // important: callback must always capture by value - NDArray ret = *to; int a = from.ctx().dev_mask(); int b = to->ctx().dev_mask(); if (a == cpu::kDevMask && b == cpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([from, ret](RunContext ctx) { - std::vector inputs(2); - inputs[0] = from.data(); - inputs[1] = ret.data(); - mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); - }, from.ctx(), {from.var()}, {ret.var()}, + Engine::Get()->PushSync([from, to, original_size, neg_threshold, pos_threshold](RunContext ctx) { + std::vector inputs = {from.data(), to->data()}; + mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs, original_size, neg_threshold, pos_threshold); + }, from.ctx(), {from.var()}, {to->var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { LOG(FATAL) << "Unsupported dequantization " << compress << std::endl; @@ -632,14 +629,12 @@ void Dequantize(const NDArray &from, NDArray *to, const std::string& compress, i #if MXNET_USE_CUDA if (a == gpu::kDevMask && b == gpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([from, ret](RunContext ctx) { - std::vector inputs(2); - inputs[0] = from.data(); - inputs[1] = ret.data(); - mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs); + Engine::Get()->PushSync([from, ret, original_size, neg_threshold, pos_threshold](RunContext ctx) { + std::vector inputs = {from.data(), to->data()}; + mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs, original_size, neg_threshold, pos_threshold); // 
Wait GPU kernel to complete ctx.get_stream()->Wait(); - }, from.ctx(), {from.var()}, {ret.var()}, + }, from.ctx(), {from.var()}, {to->var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); } else { LOG(FATAL) << "Unsupported dequantization " << compress << std::endl; diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index 4f4f67306f9f..19e5cb1962c4 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -23,6 +23,7 @@ */ // this will be invoked by gcc and compile CPU version +#include #include "./ndarray_function.h" #include "./ndarray_function-inl.h" #include "../common/utils.h" @@ -188,8 +189,9 @@ void ElementwiseSum(mshadow::Stream* s, * \brief Enables use of function defined under Dequantize2Bit operator for an ndarray */ template<> -void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { - mxnet::op::Dequantize2BitImpl(s, inputs); +void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, int original_size, + const float neg_threshold, const float pos_threshold) { + mxnet::op::Dequantize2BitImpl(s, inputs, original_size, neg_threshold, pos_threshold); } /* @@ -197,8 +199,9 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i */ template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, + ps::SArray push_pskv_lens, ps::SArray pull_pskv_lens, const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImpl(s, inputs, neg_threshold, pos_threshold); + mxnet::op::Quantize2BitImpl(s, inputs, push_pskv_lens, pull_pskv_lens, neg_threshold, pos_threshold); } } // namespace ndarray diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index 6c2924881a56..56dd90ca19d1 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -207,8 +207,9 @@ void ElementwiseSum(mshadow::Stream* s, * \brief Enables use of function defined under 
Dequantize2Bit operator for an ndarray */ template<> -void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs) { - mxnet::op::Dequantize2BitImpl(s, inputs); +void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, int original_size, + const float neg_threshold, const float pos_threshold) { + mxnet::op::Dequantize2BitImpl(s, inputs, original_size, neg_threshold, pos_threshold); } /* @@ -216,8 +217,9 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i */ template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, + ps::SArray push_pskv_lens, ps::SArray pull_pskv_lens, const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImpl(s, inputs, neg_threshold, pos_threshold); + mxnet::op::Quantize2BitImpl(s, inputs, push_pskv_lens, pull_pskv_lens, neg_threshold, pos_threshold); } } // namespace ndarray diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index 37c22d1dd4f7..a76a2f8b927e 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -168,13 +168,15 @@ void Copy(const TBlob &from, TBlob *to, * \brief Enables use of function defined under Dequantize2Bit operator for an ndarray */ template -void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs); +void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, int original_size, + const float neg_threshold, const float pos_threshold); /* * \brief Enables use of function defined under Quantize2Bit operator for an ndarray */ template void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, + ps::SArray push_pskv_lens, ps::SArray pull_pskv_lens, const float neg_threshold, const float pos_threshold); template diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index dcbb7f8ac19b..5d5f35832794 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ 
b/src/operator/contrib/two_bit_quantize-inl.h @@ -31,6 +31,7 @@ #include "../mshadow_op.h" #include "../mxnet_op.h" #include +#include "ps/ps.h" namespace mxnet { namespace op { @@ -107,42 +108,53 @@ inline bool Create2BitArrayType(const nnvm::NodeAttrs &attrs, } struct init_threshold_2bit { - MSHADOW_XINLINE static void Map(int i, + MSHADOW_XINLINE static void Map(int server_id, float *out, const float neg_threshold, const float pos_threshold, - int size) { + ps::SArray compr_sizes, + ps::SArray orig_sizes) { + // i for each server + size_t curr_pos = 0; + for (int i=0; i(size); + out[curr_pos+2] = static_cast(orig_sizes[server_id]); } }; struct quantize_2bit { - MSHADOW_XINLINE static void Map(int block_id, - int grad_size, + MSHADOW_XINLINE static void Map(int out_block_id, +// std::unordered_set meta_pos, +// std::vector cumulative_part_indices, +// ps::SArray compr_sizes, +// ps::SArray orig_sizes, + int original_size, float *out, float *grad, float *residual, const float neg_threshold, const float pos_threshold) { - float* compr_block = out + block_id; + float* compr_block = out + out_block_id; // init to 0 *compr_block = 0; // start and end are indices in original grad array - int start = block_id << 4; - int end = (start + 16 <= grad_size) ? start + 16 : grad_size; + int start = out_block_id << 4; + int end = start + 16; // <= original_size) ? 
start + 16 : original_size; char* block_ptr = reinterpret_cast < char* > (compr_block); const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; const int negbits[] = {0x80, 0x20, 0x08, 0x02}; - char* curr_byte = block_ptr; - for (int i = start; i < end; i++) { + char* curr_byte = block_ptr; + for (int i = start; i < end && i < original_size; i++) { // // adds 1 when i-start divisible by 4 - curr_byte += ((i-start) & 3); + curr_byte += ((i - start) & 3); residual[i] += grad[i]; if (residual[i] >= pos_threshold) { residual[i] -= pos_threshold; @@ -159,16 +171,33 @@ struct quantize_2bit { template void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + ps::SArray push_pskv_lens, ps::SArray pull_pskv_lens, const float neg_threshold, const float pos_threshold) { // Init threshold and original size - mxnet_op::Kernel::Launch(s, 1, - inputs[2].dptr(), // compressed array - neg_threshold, pos_threshold, - inputs[0].Size()); - // Finally, compress the data and calculate new residual - mxnet_op::Kernel::Launch(s, (inputs[2].Size()-3), - inputs[0].Size(), // original grad size - inputs[2].dptr()+3, // compressed array +// mxnet_op::Kernel::Launch(s, push_pskv_lens.size(), +// inputs[2].dptr(), // compressed array (concat for all servers) +// neg_threshold, pos_threshold, +// push_pskv_lens, pull_pskv_lens); + +// std::unordered_set meta_pos; +// std::vector cumulative_part_indices; +// int cur_pos = 0; +// int cum_index = 0; +// for(int i=0; i::Launch(s, inputs[2].Size(), // compressed array + inputs[0].Size(), +// meta_pos, cumulative_part_indices, +// push_pskv_lens, // compressed sizes +// pull_pskv_lens, // original sizes + inputs[2].dptr(), // compressed array inputs[0].dptr(), // input array inputs[1].dptr(), // residual array neg_threshold, // negative threshold @@ -184,7 +213,7 @@ void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); const TwoBitParam& param = nnvm::get(attrs.parsed); - 
Quantize2BitImpl(s, inputs, param.neg_threshold, param.pos_threshold); +// Quantize2BitImpl(s, inputs, param.neg_threshold, param.pos_threshold); } inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, @@ -229,8 +258,8 @@ struct dequantize_2bit { MSHADOW_XINLINE static void Map(int i, float *out, float *in, - float *neg_threshold, - float *pos_threshold) { + const float neg_threshold, + const float pos_threshold) { // get row ptr char* ch_ptr = (reinterpret_cast(in + (i >> 4))) + ((i & 15) >> 2); @@ -239,76 +268,26 @@ struct dequantize_2bit { int col = (i & 15) & 3; if ( ((*ch_ptr) & posbits[col]) == posbits[col] ) { - out[i] = *pos_threshold; + out[i] = pos_threshold; } // use posbits for mask as posbits are 11 // compare with negbits else if ( ((*ch_ptr) & posbits[col]) == negbits[col] ) { - out[i] = *neg_threshold; + out[i] = neg_threshold; } else { out[i] = 0; } -// -// // get column id -// // Decompress -// switch ((i & 15) & 3) { -// case 0: -// // positve -// if (((*ch_ptr) & (0xc0)) == 0xc0) { // binary: (11)00 0000 -// out[i] = *pos_threshold; -// // negative -// } else if (((*ch_ptr) & (0xc0)) == 0x80) { // binary: (10)00 0000 -// out[i] = *neg_threshold; -// } else { // 0 -// out[i] = 0; -// } -// break; -// case 1: -// // positve -// if (((*ch_ptr) & (0x30)) == 0x30) { // binary: 00(11) 0000 -// out[i] = *pos_threshold; -// // negative -// } else if ( ((*ch_ptr) & (0x30)) == 0x20) { // binary: 00(10) 0000 -// out[i] = *neg_threshold; -// } else { // 0 -// out[i] = 0; -// } -// break; -// case 2: -// // positve -// if ( ((*ch_ptr) & (0x0c)) == 0x0c) { // binary: 0000 (11)00 -// out[i] = *pos_threshold; -// // negative -// } else if (((*ch_ptr) & (0x0c)) == 0x08) { // binary: 0000 (10)00 -// out[i] = *neg_threshold; -// } else { // 0 -// out[i] = 0; -// } -// break; -// case 3: -// // positve -// if (((*ch_ptr) & (0x03))== 0x03) { -// out[i] = *pos_threshold; -// // negative -// } else if (((*ch_ptr) & (0x03)) == 0x02) { -// out[i] = *neg_threshold; 
-// } else { // 0 -// out[i] = 0; -// } -// break; -// default: -// break; -// } } }; template -void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs) { +void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, int original_size, + const float neg_threshold, const float pos_threshold) { // Can only decompress the float32 data - mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size - inputs[1].dptr(), // out array - inputs[0].dptr()+3, // compressed array - inputs[0].dptr(), // negative threshold - inputs[0].dptr()+1); // positive threshold + mxnet_op::Kernel::Launch(s, original_size, // original size + inputs[1].dptr(), // out array + inputs[0].dptr(), // compressed array + neg_threshold, // negative threshold + pos_threshold); // positive threshold } template @@ -318,7 +297,7 @@ void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); - Dequantize2BitImpl(s, inputs); +// Dequantize2BitImpl(s, inputs); } inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index a88e9922ad4f..1f05f96a1901 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -294,5 +294,5 @@ def check_init(kv, cur_keys, cur_shape, device=False): print('worker ' + str(my_rank) + ' is initialized') if __name__ == "__main__": - test_sync_init() + # test_sync_init() test_sync_push_pull() From 0feabd5788fe49cdea0743d8612075bf37eda10c Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 25 Oct 2017 17:38:18 -0700 Subject: [PATCH 139/237] fix ndarray dequantize --- src/ndarray/ndarray.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 466b9690caa4..df40ec048d14 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -629,7 +629,7 @@ void Dequantize(const 
NDArray &from, NDArray *to, int original_size, #if MXNET_USE_CUDA if (a == gpu::kDevMask && b == gpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([from, ret, original_size, neg_threshold, pos_threshold](RunContext ctx) { + Engine::Get()->PushSync([from, to, original_size, neg_threshold, pos_threshold](RunContext ctx) { std::vector inputs = {from.data(), to->data()}; mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs, original_size, neg_threshold, pos_threshold); // Wait GPU kernel to complete From d3e4df8ba82ec68b5de2d2be8911182d7755d5c4 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 26 Oct 2017 00:46:27 +0000 Subject: [PATCH 140/237] debug stuff --- example/image-classification/common/fit.py | 4 +++- tools/bandwidth/measure.py | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 99453c3094ae..b82a147c88d9 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -103,6 +103,8 @@ def add_fit_args(parser): help='1 means test reading speed without training') train.add_argument('--dtype', type=str, default='float32', help='precision: float32 or float16') + train.add_argument('--compress', type=str, default='none') + train.add_argument('--threshold', type=float, default=0.5) return train def fit(args, network, data_loader, **kwargs): @@ -114,7 +116,7 @@ def fit(args, network, data_loader, **kwargs): """ # kvstore kv = mx.kvstore.create(args.kv_store) - kv.set_compress({'compress':'2bit', 'pos_threshold':4.0, 'neg_threshold':-4.0}) + kv.set_compress({'compress':args.compress, 'pos_threshold':args.threshold, 'neg_threshold':-1*args.threshold}) # logging head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) diff --git a/tools/bandwidth/measure.py b/tools/bandwidth/measure.py index 66ef7371f11e..d9c9fbe930a1 
100644 --- a/tools/bandwidth/measure.py +++ b/tools/bandwidth/measure.py @@ -53,6 +53,7 @@ def parse_args(): help='number of classes') parser.add_argument('--optimizer', type=str, default='None', help='the optimizer set to kvstore. None means no optimizer') + parser.add_argument('--compress', type=str, default='none') args = parser.parse_args() logging.info(args) return args @@ -72,10 +73,11 @@ def error(gpu_res, cpu_res): return res def run(network, optimizer, gpus, kv_store, image_shape, disp_batches, - num_batches, test_results, **kwargs): + num_batches, test_results, compress, **kwargs): # create kvstore and optimizer devs = [mx.gpu(int(i)) for i in gpus.split(',')] kv = mx.kv.create(kv_store) + kv.set_compress({'compress':compress,'pos_threshold':0.5, 'neg_threshold':-0.5}) if optimizer is None or optimizer == 'None': opt = None else: @@ -89,6 +91,9 @@ def run(network, optimizer, gpus, kv_store, image_shape, disp_batches, data_shape = (32,) + tuple([int(s) for s in image_shape.split(',')]) shapes = get_shapes(symbol, data_shape) + + for s in shapes: + print(s) size = float(sum([reduce(lambda x,y : x*y, s, 1) for s in shapes])) * 4 / 1e6 logging.info('num of arrays = %d, total size = %f MB' % (len(shapes), size)) From d6801dd7df91c852ed204f1fba2d1bacc01a6522 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 25 Oct 2017 17:48:10 -0700 Subject: [PATCH 141/237] fix bug --- src/kvstore/kvstore_dist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 749d3a4d4c91..7ef11cf774dc 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -700,7 +700,7 @@ class KVStoreDist : public KVStoreLocal { } CHECK_EQ(static_cast(push_pskv.size), compr_size); CHECK_EQ(static_cast(pull_pskv.size), original_size); - CHECK_EQ(push_pskv.lens.size(), num_servers+1); + CHECK_EQ(push_pskv.lens.size(), num_servers*2); } } return pskv; From e97c4774b3a780d068801fc15e68c4c0c4f15d77 Mon Sep 
17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 25 Oct 2017 18:24:40 -0700 Subject: [PATCH 142/237] trying merge dequntize --- src/operator/contrib/two_bit_quantize-inl.h | 52 +++++++++++++++------ 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 5d5f35832794..3f3e7f75a4ec 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -255,27 +255,50 @@ inline bool Quantize2BitType(const nnvm::NodeAttrs& attrs, struct dequantize_2bit { // Decompress - MSHADOW_XINLINE static void Map(int i, + MSHADOW_XINLINE static void Map(int compr_block_id, + int original_size, float *out, float *in, const float neg_threshold, const float pos_threshold) { - // get row ptr - char* ch_ptr = (reinterpret_cast(in + (i >> 4))) + ((i & 15) >> 2); + int out_start_id = compr_block_id<<4; + float* outval = out + out_start_id; + char* ch_ptr = reinterpret_cast(in + compr_block_id); const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; const int negbits[] = {0x80, 0x20, 0x08, 0x02}; - - int col = (i & 15) & 3; - if ( ((*ch_ptr) & posbits[col]) == posbits[col] ) { - out[i] = pos_threshold; - } // use posbits for mask as posbits are 11 - // compare with negbits - else if ( ((*ch_ptr) & posbits[col]) == negbits[col] ) { - out[i] = neg_threshold; - } else { - out[i] = 0; + for (int i = out_start_id; (i < out_start_id + 16) && (i < original_size); ++i, ++outval ) { + ch_ptr += !(i & 3); + int col = i & 3; + if ( ((*ch_ptr) & posbits[col]) == posbits[col] ) { + *outval = pos_threshold; + } // use posbits for mask as posbits are 11 + // compare with negbits + else if ( ((*ch_ptr) & posbits[col]) == negbits[col] ) { + *outval = neg_threshold; + } else { + *outval = 0; + } } + + + // get row ptr +// char* ch_ptr = (reinterpret_cast(in + (i >> 4))); +// for (int i=0 ) +// + ((i & 15) >> 2); +// const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; +// const 
int negbits[] = {0x80, 0x20, 0x08, 0x02}; +// +// int col = (i & 15) & 3; +// if ( ((*ch_ptr) & posbits[col]) == posbits[col] ) { +// out[i] = pos_threshold; +// } // use posbits for mask as posbits are 11 +// // compare with negbits +// else if ( ((*ch_ptr) & posbits[col]) == negbits[col] ) { +// out[i] = neg_threshold; +// } else { +// out[i] = 0; +// } } }; @@ -283,7 +306,8 @@ template void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, int original_size, const float neg_threshold, const float pos_threshold) { // Can only decompress the float32 data - mxnet_op::Kernel::Launch(s, original_size, // original size + mxnet_op::Kernel::Launch(s, original_size/16, // original size + original_size, inputs[1].dptr(), // out array inputs[0].dptr(), // compressed array neg_threshold, // negative threshold From 18df71ef8832d4c4568e3bbe6f8e0265f0d047d9 Mon Sep 17 00:00:00 2001 From: cjolivier01 Date: Wed, 25 Oct 2017 20:35:03 -0700 Subject: [PATCH 143/237] Frmework and validation tests for operator validation and performance-testing in C++ Normally used for gtest tests. 
--- src/c_api/c_api_ndarray.cc | 56 +- src/imperative/imperative_utils.h | 79 ++- src/operator/elemwise_op_common.h | 3 +- tests/cpp/include/test_compute.h | 98 +++ tests/cpp/include/test_core_op.h | 555 +++++++++++++++ tests/cpp/include/test_legacy_op.h | 551 ++++++++++++++ tests/cpp/include/test_op.h | 557 ++------------- tests/cpp/include/test_op_runner.h | 72 +- tests/cpp/include/test_util.h | 2 +- tests/cpp/operator/activation_perf.cc | 11 +- tests/cpp/operator/batchnorm_test.cc | 827 ++++++++++++---------- tests/cpp/operator/core_op_runner_test.cc | 271 +++++++ tests/cpp/operator/fully_conn_perf.cc | 31 +- 13 files changed, 2097 insertions(+), 1016 deletions(-) create mode 100644 tests/cpp/include/test_compute.h create mode 100644 tests/cpp/include/test_core_op.h create mode 100644 tests/cpp/include/test_legacy_op.h create mode 100644 tests/cpp/operator/core_op_runner_test.cc diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index 88f14ab04274..2c4a30501147 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -35,59 +35,10 @@ #include "./c_api_common.h" #include "../common/utils.h" #include "../common/exec_utils.h" +#include "../imperative/imperative_utils.h" using namespace mxnet; -nnvm::NodeAttrs ParseAttrs(const nnvm::Op *op, - const int& num_inputs, - const int& num_params, - const char **param_keys, - const char **param_vals) { - static auto& num_args = nnvm::Op::GetAttr("key_var_num_args"); - - nnvm::NodeAttrs attrs; - attrs.op = op; - attrs.dict.reserve(num_params+1); - for (int i = 0; i < num_params; ++i) { - attrs.dict.emplace(param_keys[i], param_vals[i]); - } - if (num_args.count(op)) { - attrs.dict.emplace(num_args[op], std::to_string(num_inputs)); - } - if (op->attr_parser != nullptr) { - op->attr_parser(&attrs); - } - - return attrs; -} - -void SetNumOutputs(const nnvm::Op *op, - const nnvm::NodeAttrs& attrs, - const int& num_inputs, - int* infered_num_outputs, - int* num_visible_outputs) { - static 
auto& visible_out = nnvm::Op::GetAttr("FNumVisibleOutputs"); - int infered_num_inputs; - if (op->get_num_inputs != nullptr) { - infered_num_inputs = op->get_num_inputs(attrs); - } else { - infered_num_inputs = op->num_inputs; - } - CHECK_EQ(num_inputs, infered_num_inputs) - << "Operator " << op->name << " expects " << infered_num_inputs - << " inputs, but got " << num_inputs << " instead."; - if (op->get_num_outputs != nullptr) { - *infered_num_outputs = op->get_num_outputs(attrs); - } else { - *infered_num_outputs = op->num_outputs; - } - *num_visible_outputs = *infered_num_outputs; - if (visible_out.count(op)) { - *num_visible_outputs = visible_out[op](attrs); - CHECK_LE(*num_visible_outputs, *infered_num_outputs); - } -} - void SetNDInputsOutputs(const nnvm::Op* op, std::vector* ndinputs, std::vector* ndoutputs, @@ -137,11 +88,12 @@ void MXImperativeInvokeImpl(AtomicSymbolCreator creator, const nnvm::Op* op = static_cast(creator); MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); - nnvm::NodeAttrs attrs = ParseAttrs(op, num_inputs, num_params, param_keys, param_vals); + nnvm::NodeAttrs attrs = imperative::ParseAttrs(op, num_inputs, num_params, + param_keys, param_vals); int infered_num_outputs; int num_visible_outputs; - SetNumOutputs(op, attrs, num_inputs, &infered_num_outputs, &num_visible_outputs); + imperative::SetNumOutputs(op, attrs, num_inputs, &infered_num_outputs, &num_visible_outputs); std::vector ndinputs, ndoutputs; SetNDInputsOutputs(op, &ndinputs, &ndoutputs, num_inputs, inputs, diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 3758b47ec2a6..640476b6a8bc 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -80,10 +80,10 @@ inline Context GetContext(const nnvm::NodeAttrs& attrs, // Set the shape, dtype, storage type and dispatch mode via the attribute inference functions inline void SetShapeType(const Context& ctx, - const nnvm::NodeAttrs& attrs, - const 
std::vector& inputs, - const std::vector& outputs, - DispatchMode* dispatch_mode) { + const nnvm::NodeAttrs& attrs, + const std::vector& inputs, + const std::vector& outputs, + DispatchMode* dispatch_mode) { static auto& infershape = nnvm::Op::GetAttr("FInferShape"); static auto& infertype = nnvm::Op::GetAttr("FInferType"); static auto& inferstorage = nnvm::Op::GetAttr("FInferStorageType"); @@ -230,8 +230,8 @@ inline void SetDependency(const nnvm::NodeAttrs& attrs, } inline void SetWriteInplaceReq(const std::vector& inputs, - const std::vector& outputs, - std::vector *req) { + const std::vector& outputs, + std::vector *req) { std::unordered_set in_vars; in_vars.reserve(inputs.size()); for (auto &i : inputs) { @@ -247,6 +247,73 @@ inline void SetWriteInplaceReq(const std::vector& inputs, } } +/*! + * \brief Parse parameter attributes into a nnvm::NodeAttrs structure + * \param op Pointer to the nnvm Operator object + * \param num_inputs Number of operator inputs + * \param num_params Number of parameters + * \param param_keys Array of string pointers representing the parameter keys + * \param param_vals Array of string pointers representing the associated values + * \return nnvm::NodeAttrs structure representing the parsed attributes + */ +inline nnvm::NodeAttrs ParseAttrs(const nnvm::Op *op, + const int num_inputs, + const int num_params, + const char **param_keys, + const char **param_vals) { + static auto& num_args = nnvm::Op::GetAttr("key_var_num_args"); + + nnvm::NodeAttrs attrs; + attrs.op = op; + attrs.dict.reserve(num_params+1); + for (int i = 0; i < num_params; ++i) { + attrs.dict.emplace(param_keys[i], param_vals[i]); + } + if (num_args.count(op)) { + attrs.dict.emplace(num_args[op], std::to_string(num_inputs)); + } + if (op->attr_parser != nullptr) { + op->attr_parser(&attrs); + } + + return attrs; +} + +/*! 
+ * \brief Determine number of outputs for the given operator + * \param op Pointer to the nnvm Operator object + * \param attrs nnvm::NodeAttrs structure representing the operator's attributes + * \param num_inputs Number of inputs to the operator + * \param infered_num_outputs The inferred number of outputs + * \param num_visible_outputs The actual number of visible outputs + */ +inline void SetNumOutputs(const nnvm::Op *op, + const nnvm::NodeAttrs& attrs, + const int& num_inputs, + int* infered_num_outputs, + int* num_visible_outputs) { + static auto& visible_out = nnvm::Op::GetAttr("FNumVisibleOutputs"); + int infered_num_inputs; + if (op->get_num_inputs != nullptr) { + infered_num_inputs = op->get_num_inputs(attrs); + } else { + infered_num_inputs = op->num_inputs; + } + CHECK_EQ(num_inputs, infered_num_inputs) + << "Operator " << op->name << " expects " << infered_num_inputs + << " inputs, but got " << num_inputs << " instead."; + if (op->get_num_outputs != nullptr) { + *infered_num_outputs = op->get_num_outputs(attrs); + } else { + *infered_num_outputs = op->num_outputs; + } + *num_visible_outputs = *infered_num_outputs; + if (visible_out.count(op)) { + *num_visible_outputs = visible_out[op](attrs); + CHECK_LE(*num_visible_outputs, *infered_num_outputs); + } +} + inline void DerefInputOutput(const std::vector& inputs, const std::vector& outputs, std::vector* p_inputs, diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h index d1ef4d8398a5..8216eacd801e 100644 --- a/src/operator/elemwise_op_common.h +++ b/src/operator/elemwise_op_common.h @@ -227,8 +227,9 @@ struct CloneGradient { std::vector ret; const size_t input_count = n->inputs.size(); ret.reserve(input_count); - for (size_t i = 0; i < input_count; ++i) + for (size_t i = 0; i < input_count; ++i) { ret.emplace_back(ograds[0]); + } return ret; } }; diff --git a/tests/cpp/include/test_compute.h b/tests/cpp/include/test_compute.h new file mode 100644 index 
000000000000..0100fde0a3db --- /dev/null +++ b/tests/cpp/include/test_compute.h @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file test_compute.h + * \brief operator unit test utility functions + * \author Chris Olivier + * + * These classes offer a framework for developing, testing and debugging operators + * in C++. They work for both CPU and GPU modes, as well as offer a timing + * infrastructure in order to test individual operator performance. + * + * Operator data can be validated against general logic, + * stored scalar values (which can be generated by this code from an existing operator via + * BasicOperatorData::dumpC()), as well as against each other (i.e., check that + * GPU, CPU, MKL, and CUDNN operators produce the same output given the same input). 
+ * + * test_util.h: General testing utility functionality + * test_perf.h: Performance-related classes + * test_op.h: Operator-specific testing classes + */ +#ifndef TEST_COMPUTE_H_ +#define TEST_COMPUTE_H_ + +#include "../../src/common/utils.h" + +namespace mxnet { +namespace test { +namespace op { + +class ComputeOp /*: public Operator*/ { + public: + ComputeOp(const OpContext &ctx, const nnvm::NodeAttrs& attrs) + : attrs_(attrs) + , forward_(nullptr) + , backward_(nullptr) { + // TODO(cjolivier01): Determine backward op from op + //common::GetFCompute(attrs.op, "FCompute", ctx) + } + + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + if(forward_) { + forward_(attrs_, ctx, in_data, req, out_data); + } + } + + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + if(backward_) { + backward_(attrs_, ctx, { in_data }, req, in_grad); + } + } + +// virtual std::string TypeString() const { +// return "ComputeOp"; +// } + + protected: + nnvm::NodeAttrs attrs_; + FCompute forward_; + FCompute backward_; +}; + +template +class ComputeOpProp /*: public OperatorProperty*/ { + +}; + +} // namespace op +} // namespace test +} // namespace mxnet + +#endif // TEST_COMPUTE_H_ diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h new file mode 100644 index 000000000000..0ed8b7e717a6 --- /dev/null +++ b/tests/cpp/include/test_core_op.h @@ -0,0 +1,555 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TEST_CORE_OP_H_ +#define TEST_CORE_OP_H_ + +#include "./test_op.h" + +namespace mxnet { +namespace test { +namespace op { + +// Tried making this a struct w/constexpr, but getting undefined reference on gcc 5.4.1 +#define COREOP_FWD_OP_NAME_KEY "fwd_op_name" +#define COREOP_BWD_OP_NAME_KEY "bwd_op_name" + +/*! + * Low-noise operator executor + * @tparam DType Data type for the operator executions + */ +template +class CoreOpExecutor : public test::op::OperatorDataInitializer + , public test::op::OperatorExecutorTiming { + /*! \brief Performance timing categories */ + enum TimingId { + Forward, + Backward + }; + + /*! 
+ * \brief Access data blob as if on the CPU via a callback + * \tparam Type of callback Function to call with CPU-data NDArray + * \param src Source NDArray (on GPU or CPU) + * \param run_ctx Run context + * \param cb Callback Function to call with CPU-data NDArray + */ + template + static inline void AccessAsCPU(const NDArray &src, const RunContext &run_ctx, CallbackFunction cb) { + if(src.ctx().dev_type == Context::kCPU) { + cb(src); + } else { + Context cpu_ctx, gpu_ctx = src.ctx(); + cpu_ctx.dev_type = Context::kCPU; + cpu_ctx.dev_id = 0; + NDArray on_cpu(src.shape(), cpu_ctx); + on_cpu.CheckAndAlloc(); + TBlob tmp1 = on_cpu.data(); + mxnet::ndarray::Copy(src.data(), &tmp1, cpu_ctx, gpu_ctx, run_ctx); + cb(on_cpu); + TBlob tmp2 = src.data(); + mxnet::ndarray::Copy(on_cpu.data(), &tmp2, gpu_ctx, cpu_ctx, run_ctx); + } + } + + /*! + * \brief Parse additional arguments into NodeAttrs structure + * \param op Pointer to operator object + * \param args vector of string pairs representing argument key/value pairs + * \return Constructed NodeAttrs structure + */ + static nnvm::NodeAttrs ParseAttrs(const nnvm::Op *op, const kwargs_t& args) { + const size_t count = args.size(); + std::vector keys, values; + keys.reserve(count); + values.reserve(count); + for (kwargs_t::const_iterator i_iter = args.begin(), e_iter = args.end(); i_iter != e_iter; ++i_iter) { + keys.push_back(i_iter->first.c_str()); + values.push_back(i_iter->second.c_str()); + } + return imperative::ParseAttrs(op, op->num_inputs, count, &keys[0], &values[0]); + } + + /*! 
+ * \brief Return vector of data blobs associated with an array of NDArray objects + * \param src vector of NDArrays + * \param dest Vector to store pointers to the NDArrays' data blobs + * \return Reference to the supplied vector of TBlob results + */ + static inline std::vector& CollectBlobs(std::vector& src, std::vector *dest) { + dest->reserve(dest->size() + src.size()); + for (size_t i = 0, n = src.size(); i < n; ++i) { + dest->push_back(src[i].data()); + } + return *dest; + } + + /*! + * \brief Create NDArray of random data + * \param shape Shape of the tensor to be created + * \param ctx Context to use when creating the array/tensor + * \return The created NDArray + */ + NDArray CreateRandArray(const TShape& shape, const Context& ctx) const { + CHECK_GT(shape.Size(), 0); // Check it's a valid shape + NDArray array(shape, ctx, true, mshadow::DataType::kFlag); + array.CheckAndAlloc(); + AccessAsCPU(array, ctx_.run_ctx, [this](const NDArray &arr) { + test::op::OperatorDataInitializer::FillRandom(arr.data()); + }); + return std::move(array); + } + + /*! + * \brief Create NDArray of zeros + * \param shape Shape of the tensor to be created + * \param ctx Context to use when creating the array/tensor + * \return The created NDArray + */ + NDArray CreateZeroArray(const TShape& shape, const Context& ctx) const { + CHECK_GT(shape.Size(), 0); // Check it's a valid shape + NDArray array(shape, ctx, true, mshadow::DataType::kFlag); + array.CheckAndAlloc(); + AccessAsCPU(array, ctx_.run_ctx, [this](const NDArray &arr) { + test::op::OperatorDataInitializer::FillZero(arr.data()); + }); + return std::move(array); + } + + nnvm::NodePtr MakeNode() const { + nnvm::NodePtr node = nnvm::Node::Create(); + node->attrs = attrs_; + return node; + } + + /*! 
+ * \brief Get backward op executors + * \return Vector of backward executors + */ + std::vector, std::string>> GetBackward() { + std::vector, std::string>> res; + static auto gradient = nnvm::Op::GetAttr("FGradient"); + nnvm::FGradient grad_fun = gradient.get(op_, nullptr); + if (grad_fun) { + std::vector out_grads; + std::vector entries = grad_fun(MakeNode(), out_grads); + CHECK_GE(entries.size(), 1U); + res.reserve(entries.size()); + for (const nnvm::NodeEntry& node_entry : entries) { + CHECK_NOTNULL(node_entry.node.get()); + CHECK_NOTNULL(node_entry.node->op()); + CHECK_GT(node_entry.node->op()->name.size(), 0); + if (verbose_) { + std::cout << node_entry.node->op()->name << std::endl; + } + std::shared_ptr pOp = std::make_shared( + ctx().run_ctx.ctx.dev_type == Context::kGPU, outputs()[0].shape()); + res.push_back({ pOp, node_entry.node->op()->name }); + } + } + return res; + } + + void AttachResources(OpContext &ctx, const nnvm::NodeAttrs& attrs, const nnvm::Op *op) { + static auto& fresource = nnvm::Op::GetAttr("FResourceRequest"); + if (fresource.count(op) != 0) { + std::vector& requested = ctx.requested; + auto reqs = fresource[op](attrs); + // Get the resource of temporal space. + for (const ResourceRequest& req : reqs) { + if (req.type == ResourceRequest::kTempSpace) { + Resource r = ResourceManager::Get()->Request(ctx.run_ctx.ctx, req); + requested.push_back(r); + } else if (req.type == ResourceRequest::kRandom) { + requested.push_back(ResourceManager::Get()->Request(ctx.run_ctx.ctx, req)); + } else { + LOG(FATAL) << "resource type not yet supported"; + } + } + } + } + + public: + typedef DType DataType; + + /*! 
\brief Add 'fwd_op_name' to kwargs and return the new kwargs */ + static kwargs_t ArgsWithOpName(const kwargs_t& args, + const std::string& fwd_op_name, + const std::string& bwd_op_name = "") { + CHECK(!fwd_op_name.empty()); + kwargs_t new_args; + new_args.reserve(args.size() + 1); + for (const auto& a : args) { + if (a.first != COREOP_FWD_OP_NAME_KEY && a.first != COREOP_BWD_OP_NAME_KEY) { + new_args.push_back(a); + } + } + new_args.push_back({ COREOP_FWD_OP_NAME_KEY, fwd_op_name}); + if(!bwd_op_name.empty()) { + new_args.push_back({ COREOP_BWD_OP_NAME_KEY, bwd_op_name}); + } + return new_args; + } + + /*! \brief Remove 'fwd_op_name' from kwargs and return the new kwargs */ + static kwargs_t ArgsSansOpName(const kwargs_t& args, + std::string* fwd_op_name_ptr, + std::string* bwd_op_name_ptr = nullptr) { + CHECK_NOTNULL(fwd_op_name_ptr); + CHECK_NOTNULL(bwd_op_name_ptr); + bwd_op_name_ptr->resize(0); + kwargs_t new_args; + new_args.reserve(args.size()); + for (const auto& a : args) { + if (a.first == COREOP_FWD_OP_NAME_KEY) { + *fwd_op_name_ptr = a.second; + } else if(a.first == COREOP_BWD_OP_NAME_KEY) { + *bwd_op_name_ptr = a.second; + } else { + new_args.push_back(a); + } + } + return new_args; + } + + /*! + * \brief Constructor + */ + CoreOpExecutor(const bool isGPU, const TShape& shape) + : input_shape_(shape) + , op_(nullptr) { + ctx_.is_train = true; + ctx_.run_ctx.ctx.dev_id = 0; + ctx_.run_ctx.stream = nullptr; + if (isGPU) { + ctx_.run_ctx.ctx.dev_type = Context::kGPU; + allocGPUStream_.reset(new GPUStreamScope(&ctx_)); + } else { + ctx_.run_ctx.ctx.dev_type = Context::kCPU; + } + } + + /*! 
+ * \brief Initialize the execution objects and execution data (only occurs once) + * \param args Parameter arguments + * \param inputs Optional input data (otherwise, random data will be used as input) + */ + void Init(const kwargs_t& in_args, + const std::vector& inputs = {}, + const std::vector& outputs = {}, + const CoreOpExecutor *backward_for_op = nullptr + ) { + if (!initialized_) { + initialized_ = true; + + std::string op_name, bwd_op_name; + kwargs_t args = ArgsSansOpName(in_args, &op_name, &bwd_op_name); + CHECK(op_name.empty() == false); + + CHECK(!backward_for_op || bwd_op_name.empty()) + << "Backward op should not be supplied another backward operator"; + + if(verbose_ && backward_for_op) { + std::cout << "Backward op: " << op_name; + } + + op_ = nnvm::Op::Get(op_name); + CHECK_NOTNULL(op_); + + // Set up forward + attrs_ = ParseAttrs(op_, args); + + const int num_inputs = op_->num_inputs; + + if (!inputs.empty()) { + CHECK_EQ(inputs.size(), static_cast(num_inputs)); + } + + int inferred_num_outputs, num_visible_outputs; + + imperative::SetNumOutputs(op_, attrs_, num_inputs, &inferred_num_outputs, &num_visible_outputs); + + // Generic, all shapes the same. Probably this will need to be adjusted for more complex + // operators such as dot + std::vector shapes(static_cast(std::max(num_visible_outputs, num_inputs)), + input_shape_); + + std::vector inputs_p, outputs_p; + + if (!outputs.empty()) { + CHECK_EQ(outputs.size(), static_cast(num_visible_outputs)); + } + + inputs_.reserve(num_inputs); + inputs_p.reserve(num_inputs); + outputs_.reserve(num_visible_outputs); + outputs_p.reserve(num_visible_outputs); + + for (int i = 0; i < num_inputs; ++i) { + inputs_.push_back(i < inputs.size() ? 
inputs[i] : CreateRandArray(shapes[i], + ctx_.run_ctx.ctx)); + inputs_p.push_back(&*inputs_.rbegin()); + } + + for (int i = 0; i < num_visible_outputs; ++i) { + // If supplied and valid, pass from the supplied outputs vector + // Otherwise use empty for forward pass, or zero-filled for backward pass + outputs_.push_back(i < outputs.size() + ? outputs[i] + : (backward_for_op ? CreateZeroArray(shapes[i], ctx_.run_ctx.ctx) + : NDArray())); + outputs_p.push_back(&*outputs_.rbegin()); + } + + if(!backward_for_op) { + DispatchMode dispatch_mode = DispatchMode::kUndefined; + imperative::SetShapeType(ctx_.run_ctx.ctx, attrs_, inputs_p, outputs_p, &dispatch_mode); + } else { + // Backward op, so set based upon inputs + CHECK_EQ(static_cast(num_visible_outputs), backward_for_op->inputs().size()); + for (int i = 0; i < num_visible_outputs; ++i) { + CHECK_LT(static_cast(i), shapes.size()); + // backward outputs should look like forward inputs + CHECK_EQ(backward_for_op->inputs()[i].shape(), outputs_[i].shape()); + } + } + + std::vector req; + imperative::SetWriteInplaceReq(inputs_p, outputs_p, &req_); + + CollectBlobs(inputs_, &blob_inputs_); + CollectBlobs(outputs_, &blob_outputs_); + + function_ = common::GetFCompute(op_, "FCompute", ctx_.run_ctx.ctx); + functionex_ = common::GetFCompute(op_, "FComputeEx", ctx_.run_ctx.ctx); + + AttachResources(ctx_, attrs_, op_); + + if(!backward_for_op) { + // Set up backward + std::vector, std::string>> bwd; + if (!bwd_op_name.empty()) { + // Backward op was specified + std::shared_ptr pOp = std::make_shared( + ctx().run_ctx.ctx.dev_type == Context::kGPU, this->outputs()[0].shape()); + bwd.push_back({pOp, bwd_op_name}); + } else { + // Try to figure out backward op + bwd = GetBackward(); + } + CHECK_GE(bwd.size(), 1U); + for (std::pair, std::string> &bw_item : bwd) { + bw_item.first->set_verbose(verbose_); + backward_.push_back(bw_item.first); + bw_item.first->Init(ArgsWithOpName(args, bw_item.second), {}, {}, this); + } + } + } + } + + 
template + inline bool initForward(const OpProp &opProp, std::vector *in_type) { + Init(opProp.GetArgs()); + return true; + } + + template + inline bool initBackward(const OpProp &opProp, std::vector *in_type) { return true; } + + inline void forward(const size_t count) { + perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), Forward, "Forward", count); + VTuneResume profile; + for (size_t i = 0; i < count; ++i) { + Execute(); + } + } + + inline void backward(const size_t count) { + perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), Backward, "Backward", count); + VTuneResume profile; + for (size_t i = 0; i < count; ++i) { + ExecuteBackward(); + } + } + + /*! + * \brief Execute the operator for a dense tensor + */ + void Execute() { + CHECK_EQ(initialized_, true); + CHECK_NOTNULL(function_); + function_(attrs_, ctx_, blob_inputs_, req_, blob_outputs_); + } + + /*! + * \brief Execute the operator for a sparse tensor + */ + void ExecuteEx() { + CHECK_EQ(initialized_, true); + CHECK_NOTNULL(functionex_); + functionex_(attrs_, ctx_, inputs_, req_, outputs_); + } + + /*! + * \brief Execute backward pass on operator + */ + bool ExecuteBackward() { + CHECK_EQ(initialized_, true); + CHECK(!backward_.empty()); + if (!backward_.empty()) { + // Avoid locked ref count here + for (std::shared_ptr &p : backward_) { + p->Execute(); + } + return true; + } + return false; + } + + /*! + * \brief Execute backward pass on operator + */ + bool ExecuteBackwardEx() { + CHECK_EQ(initialized_, true); + CHECK(!backward_.empty()); + if (!backward_.empty()) { + // Avoid locked ref count here + for (std::shared_ptr &p : backward_) { + p->ExecuteEx(); + } + return true; + } + return false; + } + + /*! + * \brief Get the operator context + * \return Reference to this operator's context object + */ + const OpContext& ctx() const { + return ctx_; + } + + /*! 
+ * \brief Access input NDArray vector + * \return reference to NDArray vector of forward inputs + */ + std::vector& inputs() { return inputs_; } + const std::vector& inputs() const { return inputs_; } + + /*! + * \brief Access output NDArray vector + * \return reference to NDArray vector of forward outputs + */ + std::vector& outputs() { return outputs_; } + const std::vector& outputs() const { return outputs_; } + + /*! + * \brief Backward inputs (i.e. output grad) + * \return reference to NDArray vector of backward inputs + */ + std::vector& bwd_inputs() { + CHECK_EQ(backward_.size(), 1U); + return backward_[0]->inputs(); + } + + /*! + * \brief Backward outputs (i.e. input grad) + * \return reference to NDArray vector of backward outputs + */ + std::vector& bwd_outputs() { + CHECK_EQ(backward_.size(), 1U); + return backward_[0]->outputs(); + } + + void set_verbose(bool verbose) { + verbose_ = verbose; + } + + private: + /*! + * \brief Has the execution been initialized? + */ + bool initialized_ = false; + /*! + * \brief Whether to print debug trace output + */ + bool verbose_ = false; + /*! + * \brief This operator's context object + */ + OpContext ctx_; + /*! \brief + * Scoped GPU stream + */ + std::unique_ptr allocGPUStream_; + /*! + * \brief Input data shape + */ + TShape input_shape_; + /* + * \brief Pointer to the operator object + */ + const nnvm::Op *op_; + /*! + * \brief Operator attributes + */ + nnvm::NodeAttrs attrs_; + /*! + * \brief Input and output NDArray vectors + */ + std::vector inputs_, outputs_; + /*! + * \brief Vectors of the TBlob objects associated with the NDArrays in inputs_ and outputs_ + */ + std::vector blob_inputs_, blob_outputs_; + /*! + * \brief Operator request type vector + */ + std::vector req_; + /*! + * \brief Operator's FCompute function (for dense tensors) + */ + FCompute function_; + /*! + * \brief Operator's FComputeEx function (for sparse tensors) + */ + FComputeEx functionex_; + + /*! 
+ * \brief Backward executors (if any) + */ + std::vector> backward_; +}; + +class CoreOpProp { + public: + void Init(const kwargs_t& kwargs) { kwargs_ = kwargs; } + const kwargs_t& GetArgs() const { return kwargs_; } + private: + kwargs_t kwargs_; +}; + +template +using CoreOperatorRunner = test::OperatorRunner>; + +} // namespace op +} // namespace test +} // namespace mxnet + +#endif // TEST_CORE_OP_H_ diff --git a/tests/cpp/include/test_legacy_op.h b/tests/cpp/include/test_legacy_op.h new file mode 100644 index 000000000000..76f4713c46a6 --- /dev/null +++ b/tests/cpp/include/test_legacy_op.h @@ -0,0 +1,551 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file test_legacy_op.h + * \brief operator unit test utility functions + * \author Chris Olivier + * + * These classes offer a framework for developing, testing and debugging operators + * in C++. They work for both CPU and GPU modes, as well as offer a timing + * infrastructure in order to test individual operator performance. 
+ * + * Operator data can be validated against general logic, + * stored scalar values (which can be generated by this code from an existing operator via + * BasicOperatorData::dumpC(), as well as against each other (ie check that + * GPU, CPU, MKL, and CUDNN operators produce the same output given the same input. + * + * test_util.h: General testing utility functionality + * test_perf.h: Performance-related classes + * test_op.h: Operator-specific testing classes + */ +#ifndef TEST_LEGACY_OP_H_ +#define TEST_LEGACY_OP_H_ + +#include "./test_op.h" +#include "./test_op_runner.h" + +namespace mxnet { +namespace test { +namespace op { + +/*! + * \brief Manage test blobs and context, and universal logic + * Create an operator from its "Prop" class and sets up the operator + * and resources for both forward and backward passes + * \tparam DType + */ +template +class LegacyOperatorExecutor : public OperatorDataInitializer + , public OperatorExecutorTiming { + public: + typedef DType DataType; + typedef AccReal AccRealType; + + /*! \brief Manage test blobs and context */ + LegacyOperatorExecutor(const bool isGPU, const TShape& topShape) +#if !MXNET_USE_CUDA + : isGPU_(false) +#else + : isGPU_(isGPU) +#endif + , initializeForward_(0) // unit testing may call inits in any order based + , initializeBackward_(0) // upon its use-case (ie may not want to run forward pass first) + , initializeCallback_(0) { + opContext_.is_train = true; + opContext_.run_ctx.stream = nullptr; + + shape_input_vec_.push_back(topShape); + } + + inline mxnet::Context getContext() { + return isGPU_ ? mxnet::Context::GPU(0) : mxnet::Context{}; + } + + /*! \brief Initialize forward blob data values */ + virtual void resetForward() {} + + /*! \brief Initialize backward blob data values */ + virtual void resetBackward() {} + + /*! 
\brief Initialize auxiliary and output blobs */ + template + bool initForward(const OperatorPropertyType &opProp, std::vector *in_type) { + if (!initializeForward_++) { + shape_input_vec_.resize(opProp.ListArguments().size()); + op_.reset(opProp.CreateOperatorEx(getContext(), &shape_input_vec_, in_type)); + if (op_) { + const size_t output_count = opProp.ListOutputs().size(); + const size_t aux_count = opProp.ListAuxiliaryStates().size(); + // Figure out what sort of blobs we need to allocate + std::vector out_shape, aux_shape; + out_shape.resize(output_count); + aux_shape.resize(aux_count); + opProp.InferShape(&shape_input_vec_, &out_shape, &aux_shape); + std::vector out_type(output_count, -1), aux_type(aux_count, -1); + opProp.InferType(in_type, &out_type, &aux_type); + + // Allocate top blobs (input) + for (size_t x = 0, n = shape_input_vec_.size(); x < n; ++x) { + int type; + if (x < in_type->size()) { + type = (*in_type)[x]; + } else { + type = x ? mshadow::DataType::kFlag : mshadow::DataType::kFlag; + } + + allocateBlob(&c_.blob_input_vec_, shape_input_vec_[x], false, type); + } + + // Allocate aux blobs (scratch, hidden, etc.) + for (size_t x = 0, n = aux_shape.size(); x < n; ++x) { + CHECK(x < aux_type.size()); + allocateBlob(&c_.blob_aux_states_, aux_shape[x], false, aux_type[x]); + } + + // Allocate bottom blobs (output) + for (size_t x = 0, n = out_shape.size(); x < n; ++x) { + CHECK(x < out_type.size()); + allocateBlob(&c_.blob_output_vec_, out_shape[x], false, out_type[x]); + } + + // Get the resource of temporal space + std::vector inputShapes; + for (size_t x = 0, n = shape_input_vec_.size(); x < n; ++x) { + inputShapes.push_back(shape_input_vec_[x]); + } + allocateResources(opProp.ForwardResource(inputShapes)); + + resetForward(); + return true; + } + return false; + } else { + return true; + } + } + + /*! 
\brief Initialize auxiliary and output blobs */ + template + bool initBackward(const OperatorPropertyType &opProp, std::vector *in_type) { + initForward(opProp, in_type); + if (!initializeBackward_++) { + for (size_t x = 0, n = static_cast(opProp.NumVisibleOutputs()); x < n; ++x) { + CHECK_LT(x, c_.blob_output_vec_.size()); + allocateBlob(&c_.blob_out_grad_, c_.blob_output_vec_[x].shape_, + false, c_.blob_output_vec_[x].type_flag_); + } + + for (size_t x = 0, n = c_.blob_input_vec_.size(); x < n; ++x) { + allocateBlob(&c_.blob_in_grad_, c_.blob_input_vec_[x].shape_, + false, c_.blob_input_vec_[x].type_flag_); + } + + // Get the resource of temporal space + std::vector ishapes; + allocateResources(opProp.BackwardResource(ishapes)); + + resetBackward(); + return false; + } else { + return true; + } + } + + /*! \brief Run operator forward */ + void forward(const size_t count = 1) { + const std::vector req(c_.blob_output_vec_.size(), kWriteTo); + // Possibly move data to/from CPU and GPU (outside of timing scope) + MXNET_CUDA_ONLY(std::unique_ptr gpuData(isGPU_ ? + new GPUOpData(c_, &opContext_) : nullptr)); + perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), Forward, + "Forward", count); + if (!isGPU_) { + VTuneResume profile; // VTune sample only this scope + for (size_t x = 0; x < count; ++x) { + op()->Forward(opContext_, + c_.blob_input_vec_, + req, + c_.blob_output_vec_, + c_.blob_aux_states_); + } + } else { + for (size_t x = 0; x < count; ++x) { + MXNET_CUDA_ONLY(op()->Forward(opContext_, + gpuData->blob_input_vec_, + req, + gpuData->blob_output_vec_, + gpuData->blob_aux_states_)); + } + } + } + + /*! \brief Run operator backwards */ + void backward(const size_t count = 1) { + const std::vector req(c_.blob_in_grad_.size(), kWriteTo); + // Possibly move data to/from CPU and GPU (outside of timing scope) + MXNET_CUDA_ONLY(std::unique_ptr gpuData(isGPU_ ? 
+ new GPUOpData(c_, &opContext_) : nullptr)); + perf::TimingItem timeB(&OperatorExecutorTiming::GetTiming(), Backward, + "Backward", count); + if (!isGPU_) { + VTuneResume profile; // VTune sample only this scope + for (size_t x = 0; x < count; ++x) { + op()->Backward(opContext_, + c_.blob_out_grad_, + c_.blob_input_vec_, + c_.blob_output_vec_, + req, + c_.blob_in_grad_, + c_.blob_aux_states_); + } + } else { + for (size_t x = 0; x < count; ++x) { + MXNET_CUDA_ONLY(op()->Backward(opContext_, + gpuData->blob_out_grad_, + gpuData->blob_input_vec_, + gpuData->blob_output_vec_, + req, + gpuData->blob_in_grad_, + gpuData->blob_aux_states_)); + } + } + } + + /*! \brief Getter functions for the operator */ + inline Operator *op() { return op_.get(); } + inline const Operator *op() const { return op_.get(); } + + enum BlobVectorType { + kInput, + kOutput, + kAux, + kInGrad, + kOutGrad, + kBlobVectorTypeCount + }; + +#define CASE_STR(__v$) case (__v$): return #__v$ + + /*! \brief Convert BlobVectorType enum into a string */ + static inline const char *bvt2String(const BlobVectorType bvt) { + switch (bvt) { + CASE_STR(kInput); + CASE_STR(kOutput); + CASE_STR(kAux); + CASE_STR(kInGrad); + CASE_STR(kOutGrad); + default: + CHECK(false); + return ""; + } + } +#undef CASE_STR + + /*! \brief Return a particular blob in a test data set */ + inline const std::vector& getBlobVect(const BlobVectorType bvt) const { + switch (bvt) { + case kInput: + return c_.blob_input_vec_; + case kOutput: + return c_.blob_output_vec_; + case kAux: + return c_.blob_aux_states_; + case kInGrad: + return c_.blob_in_grad_; + case kOutGrad: + return c_.blob_out_grad_; + default: + CHECK(false); + return c_.blob_input_vec_; + } + } + + /*! 
\brief Dump an operator's data set into compilable C++ data code for runtime validation + * When writing an operator test, you can generate a "known good operator data state" in C++ + * code with this function, and then use load() to load the blob states into this + * class (BasicOperatorData). + * After that, you can compare with the "actual" operator state (BasicOperatorData) of + * the operator that you are testing. + */ + template + inline void dumpC(Stream *_os, const std::string& label) { + Stream& os = *_os; + os << "static const std::vector< std::vector< std::vector > > ___" + << label << "_data_shape_"; + const TShape& shape = shape_input_vec_[0]; + for (size_t i = 0, n = shape.ndim(); i < n; ++i) { + os << shape[i] << "_"; + } + os << "__ =" << std::endl << "{" << std::endl; + for (size_t x = 0; x < kBlobVectorTypeCount; ++x) { + os << " { /* " << bvt2String(BlobVectorType(x)) << " */" << std::endl; + const std::vector& blobVect = getBlobVect(BlobVectorType(x)); + for (size_t i = 0, n = blobVect.size(); i < n; ++i) { + os << " { "; + test::dump(&os, blobVect[i]); + os << " }"; + if (i < n - 1) { + os << ","; + } + os << std::endl; + } + os << " }"; + if (x < kBlobVectorTypeCount - 1) { + os << ","; + } + os << std::endl; + } + os << "};" << std::endl; + } + + static inline void copy(const TBlob& blob, const DType array[], + const size_t start, const size_t end) { + const size_t blobSize = blob.Size(); + DType *p = blob.dptr(); + for (size_t i = 0, n = end - start; i < n; ++i) { + CHECK_LT(i, blobSize); + p[i] = array[i + start]; + } + } + + /*! 
\brief Runtime load of the C++ data code generated by dumpC() */ + void load(const std::vector>>& cData) { + for (size_t i = 0, ni = cData.size(); i < ni; ++i) { + for (size_t j = 0, nj = cData[i].size(); j < nj; ++j) { + const TBlob& blob = getBlobVect(BlobVectorType(i))[j]; + const size_t sourceDataSize = cData[i][j].size(); + CHECK_EQ(sourceDataSize, blob.Size()); + const DType *sourceData = &cData[i][j][0]; + copy(blob, sourceData, 0, sourceDataSize); + } + } + } + + /*! \brief Runtime load of the C++ data code generated by dumpC() */ + void load(const std::vector>>& cData, + const BlobVectorType type) { + CHECK_LT(type, cData.size()); + for (size_t j = 0, nj = cData[type].size(); j < nj; ++j) { + const TBlob& blob = getBlobVect(type)[j]; + const size_t sourceDataSize = cData[type][j].size(); + CHECK_EQ(sourceDataSize, blob.Size()); + const DType *sourceData = &cData[type][j][0]; + copy(blob, sourceData, 0, sourceDataSize); + } + } + + /*! \brief Runtime load of the C++ data code generated by dumpC() */ + void load(const std::vector>>& cData, + const BlobVectorType type, const int idx) { + CHECK_LT(type, cData.size()); + CHECK_LT(idx, cData[type].size()); + const TBlob& blob = getBlobVect(type)[idx]; + const size_t sourceDataSize = cData[type][idx].size(); + CHECK_EQ(sourceDataSize, blob.Size()); + const DType *sourceData = &cData[type][idx][0]; + copy(blob, sourceData, 0, sourceDataSize); + } + + void FillRandom() { + for (size_t j = 0, jn = this->c_.all_blob_vects_.size(); j < jn; ++j) { + std::vector *data_vect = this->c_.all_blob_vects_[j]; + if (data_vect) { + for (size_t i = 0, n = data_vect->size(); i < n; ++i) { + OperatorDataInitializer::FillRandom((*data_vect)[i]); + } + } + } + } + + std::vector& inputs() { + return c_.blob_input_vec_; + } + std::vector& outputs() { + return c_.blob_output_vec_; + } + std::vector& bwd_inputs() { + return c_.blob_out_grad_; + } + std::vector& bwd_outputs() { + return c_.blob_in_grad_; + } + + /*! 
\brief Input and output blobs */ + OpContext opContext_; + + std::vector shape_input_vec_; + + struct OpData { + std::vector blob_input_vec_; + std::vector blob_output_vec_; + std::vector blob_aux_states_; + std::vector blob_in_grad_; + std::vector blob_out_grad_; // Remaining err (loss) pushing back upstream + + std::vector *> all_blob_vects_; + inline OpData() { + all_blob_vects_.push_back(&blob_input_vec_); + all_blob_vects_.push_back(&blob_output_vec_); + all_blob_vects_.push_back(&blob_aux_states_); + all_blob_vects_.push_back(&blob_in_grad_); + all_blob_vects_.push_back(&blob_out_grad_); // Remaining err (loss) pushing back upstream + } + virtual ~OpData() {} + }; + +#if MXNET_USE_CUDA + class GPUOpData : public OpData { + GPUOpData() = delete; + GPUOpData(const GPUOpData& o) = delete; + + public: + inline GPUOpData(const OpData& cpuData, OpContext *opContext) + : cpuData_(cpuData) + , allocGPUStream_(opContext) { + // Copy CPU->GPU + CHECK_EQ(gpuBlobs_.size(), 0U); + CHECK_EQ(cpuData_.all_blob_vects_.size(), this->all_blob_vects_.size()); + for (size_t bvt = 0, nbvt = cpuData_.all_blob_vects_.size(); bvt < nbvt; ++bvt) { + std::vector& bv_src = *cpuData_.all_blob_vects_[bvt]; + std::vector& bvt_dest = *this->all_blob_vects_[bvt]; + for (size_t i = 0, n = bv_src.size(); i < n; ++i) { + const TBlob& srcBlob = bv_src[i]; + TBlob *destBlob = allocateBlob(&gpuBlobs_, &bvt_dest, srcBlob.shape_, + true, srcBlob.type_flag_); + + Context cpu_ctx, gpu_ctx; + cpu_ctx.dev_type = Context::kCPU; + gpu_ctx.dev_type = Context::kGPU; + cpu_ctx.dev_id = gpu_ctx.dev_id = 0; + + mxnet::ndarray::Copy(srcBlob, destBlob, cpu_ctx, + gpu_ctx, allocGPUStream_.opContext_.run_ctx); + } + } + cudaDeviceSynchronize(); + } + inline ~GPUOpData() { + // Copy GPU->CPU + cudaDeviceSynchronize(); + for (size_t bvt = 0, nbvt = this->all_blob_vects_.size(); bvt < nbvt; ++bvt) { + std::vector& bv_src = *this->all_blob_vects_[bvt]; + std::vector& bvt_dest = *cpuData_.all_blob_vects_[bvt]; + for 
(size_t i = 0, n = bv_src.size(); i < n; ++i) { + const TBlob& srcBlob = bv_src[i]; + TBlob *destBlob = &bvt_dest[i]; + + Context cpu_ctx, gpu_ctx; + cpu_ctx.dev_type = Context::kCPU; + gpu_ctx.dev_type = Context::kGPU; + cpu_ctx.dev_id = gpu_ctx.dev_id = 0; + + mxnet::ndarray::Copy(srcBlob, destBlob, gpu_ctx, + cpu_ctx, allocGPUStream_.opContext_.run_ctx); + } + } + gpuBlobs_.clear(); // Force deallocation of the GPU blob data + cudaDeviceSynchronize(); + } + + private: + /*! \brief Reference to the src/dest CPU data */ + const OpData& cpuData_; + /*! \brief The GPU-allocated blobs */ + std::list> gpuBlobs_; + /*! \brief Scoped GPU stream */ + GPUStreamScope allocGPUStream_; + }; +#endif // MXNET_USE_CUDA + + protected: + OpData c_; + + /*! \brief Allocate the operator's resource requests */ + void allocateResources(const std::vector& reqs) { + std::map cached_temp; + + Context ctx; + ctx.dev_type = isGPU_ ? Context::kGPU : Context::kCPU; + ctx.dev_id = 0; + + for (const ResourceRequest& req : reqs) { + if (req.type == ResourceRequest::kTempSpace) { + if (cached_temp.count(ctx) != 0) { + opContext_.requested.push_back(cached_temp.at(ctx)); + } else { + Resource r = ResourceManager::Get()->Request(ctx, req); + opContext_.requested.push_back(r); + cached_temp[ctx] = r; + } + } else if (req.type == ResourceRequest::kRandom) { + opContext_.requested.push_back(ResourceManager::Get()->Request(ctx, req)); + } else { + LOG(FATAL) << "resource type not yet supported"; + } + } + } + + /*! \brief Locally allocate a managed TBlob and insert into the supplied vector */ + static TBlob *allocateBlob(std::list> *standalone_blobs, + std::vector *dest, + const TShape& shape, + const bool isGPU, + const int dtype) { + test::StandaloneBlob *blob = new test::StandaloneBlob(shape, isGPU, dtype); + CHECK_NE(blob, static_cast(nullptr)); + standalone_blobs->push_back(std::unique_ptr(blob)); + (*dest).push_back(*blob); + return blob; + } + + /*! 
\brief Locally allocate a managed TBlob and insert into the supplied vector */ + inline TBlob *allocateBlob(std::vector *dest, const TShape& shape, + const bool isGPU, const int dtype) { + return allocateBlob(&standalone_blobs_, dest, shape, isGPU, dtype); + } + + /*! \brief Performance timing categories */ + enum TimingId { + Forward, + Backward + }; + + /*! \brief The operator */ + std::unique_ptr op_; + /*! \brief Is this for a GPU? */ + const bool isGPU_; + /*! \brief Assure that the Forward initialized only once */ + std::atomic initializeForward_; + /*! \brief Assure that the Forward initialized only once */ + std::atomic initializeBackward_; + /*! \brief Assure that the callback is initialized only once */ + std::atomic initializeCallback_; + /*! \brief scoped lifecycle management of allocated blobs */ + std::list> standalone_blobs_; +}; + +template +using LegacyOpRunner = mxnet::test::OperatorRunner>; + +} // namespace op +} // namespace test +} // namespace mxnet + +#endif // TEST_LEGACY_OP_H_ diff --git a/tests/cpp/include/test_op.h b/tests/cpp/include/test_op.h index 4b46b80b597d..54cd99a32fc6 100644 --- a/tests/cpp/include/test_op.h +++ b/tests/cpp/include/test_op.h @@ -63,6 +63,9 @@ namespace op { #endif #if MXNET_USE_CUDA +/*! + * \brief Maintain the lifecycle of a GPU stream + */ struct GPUStreamScope { explicit inline GPUStreamScope(OpContext *opContext) : opContext_(*opContext) { @@ -83,500 +86,62 @@ struct GPUStreamScope { #endif // MXNET_USE_CUDA /*! - * \brief Manage test blobs and context, and universal logic - * Create an operator from its "Prop" class and sets up the operator - * and resources for both forward and backward passes - * \tparam DType + * \brief Base class for operator test-data classes */ -template -class BasicOperatorData { +template +class OperatorDataInitializer { public: - /*! 
\brief Manage test blobs and context */ - BasicOperatorData(const bool isGPU, const TShape& topShape) -#if !MXNET_USE_CUDA - : isGPU_(false) -#else - : isGPU_(isGPU) -#endif - , initializeForward_(0) // unit testing may call inits in any order based - , initializeBackward_(0) // upon its use-case (ie may not want to run forward pass first) - , initializeCallback_(0) - , generator_(new std::mt19937()) { - opContext_.is_train = true; - opContext_.run_ctx.stream = nullptr; - - shape_input_vec_.push_back(topShape); - } - - inline mxnet::Context getContext() { - return isGPU_ ? mxnet::Context::GPU(0) : mxnet::Context{}; - } - - /*! \brief Initialize forward blob data values */ - virtual void resetForward() {} - - /*! \brief Initialize backward blob data values */ - virtual void resetBackward() {} - - /*! \brief Initialize auxiliary and output blobs */ - virtual bool initForward(const OperatorProperty &opProp, std::vector *in_type) { - if (!initializeForward_++) { - shape_input_vec_.resize(opProp.ListArguments().size()); - op_.reset(opProp.CreateOperatorEx(getContext(), &shape_input_vec_, in_type)); - if (op_) { - const size_t output_count = opProp.ListOutputs().size(); - const size_t aux_count = opProp.ListAuxiliaryStates().size(); - // Figure out what sort of blobs we need to allocate - std::vector out_shape, aux_shape; - out_shape.resize(output_count); - aux_shape.resize(aux_count); - opProp.InferShape(&shape_input_vec_, &out_shape, &aux_shape); - std::vector out_type(output_count, -1), aux_type(aux_count, -1); - opProp.InferType(in_type, &out_type, &aux_type); - - // Allocate top blobs (input) - for (size_t x = 0, n = shape_input_vec_.size(); x < n; ++x) { - int type; - if (x < in_type->size()) { - type = (*in_type)[x]; - } else { - type = x ? mshadow::DataType::kFlag : mshadow::DataType::kFlag; - } - - allocateBlob(&c_.blob_input_vec_, shape_input_vec_[x], false, type); - } - - // Allocate aux blobs (scratch, hidden, etc.) 
- for (size_t x = 0, n = aux_shape.size(); x < n; ++x) { - CHECK(x < aux_type.size()); - allocateBlob(&c_.blob_aux_states_, aux_shape[x], false, aux_type[x]); - } - - // Allocate bottom blobs (output) - for (size_t x = 0, n = out_shape.size(); x < n; ++x) { - CHECK(x < out_type.size()); - allocateBlob(&c_.blob_output_vec_, out_shape[x], false, out_type[x]); - } - - // Get the resource of temporal space - std::vector inputShapes; - for (size_t x = 0, n = shape_input_vec_.size(); x < n; ++x) { - inputShapes.push_back(shape_input_vec_[x]); - } - allocateResources(opProp.ForwardResource(inputShapes)); - - resetForward(); - return true; - } - return false; - } else { - return true; - } - } - - /*! \brief Initialize auxiliary and output blobs */ - virtual bool initBackward(const OperatorProperty &opProp, std::vector *in_type) { - initForward(opProp, in_type); - if (!initializeBackward_++) { - for (size_t x = 0, n = static_cast(opProp.NumVisibleOutputs()); x < n; ++x) { - CHECK_LT(x, c_.blob_output_vec_.size()); - allocateBlob(&c_.blob_out_grad_, c_.blob_output_vec_[x].shape_, - false, c_.blob_output_vec_[x].type_flag_); - } - - for (size_t x = 0, n = c_.blob_input_vec_.size(); x < n; ++x) { - allocateBlob(&c_.blob_in_grad_, c_.blob_input_vec_[x].shape_, - false, c_.blob_input_vec_[x].type_flag_); - } - - // Get the resource of temporal space - std::vector ishapes; - allocateResources(opProp.BackwardResource(ishapes)); - - resetBackward(); - return false; - } else { - return true; - } - } - - /*! \brief Run operator forward */ - void forward(const size_t count = 1) { - const std::vector req(c_.blob_output_vec_.size(), kWriteTo); - // Possibly move data to/from CPU and GPU (outside of timing scope) - MXNET_CUDA_ONLY(std::unique_ptr gpuData(isGPU_ ? 
- new GPUOpData(c_, &opContext_) : nullptr)); - perf::TimingItem timeF(&timing_, Forward, "Forward", count); - if (!isGPU_) { - VTuneResume profile; // VTune sample only this scope - for (size_t x = 0; x < count; ++x) { - op()->Forward(opContext_, - c_.blob_input_vec_, - req, - c_.blob_output_vec_, - c_.blob_aux_states_); - } - } else { - for (size_t x = 0; x < count; ++x) { - MXNET_CUDA_ONLY(op()->Forward(opContext_, - gpuData->blob_input_vec_, - req, - gpuData->blob_output_vec_, - gpuData->blob_aux_states_)); - } - } + OperatorDataInitializer() + : generator_(new std::mt19937()) { } - /*! \brief Run operator backwards */ - void backward(const size_t count = 1) { - const std::vector req(c_.blob_output_vec_.size(), kWriteTo); - // Possibly move data to/from CPU and GPU (outside of timing scope) - MXNET_CUDA_ONLY(std::unique_ptr gpuData(isGPU_ ? - new GPUOpData(c_, &opContext_) : nullptr)); - perf::TimingItem timeB(&timing_, Backward, "Backward", count); - if (!isGPU_) { - VTuneResume profile; // VTune sample only this scope - for (size_t x = 0; x < count; ++x) { - op()->Backward(opContext_, - c_.blob_out_grad_, - c_.blob_input_vec_, - c_.blob_output_vec_, - req, - c_.blob_in_grad_, - c_.blob_aux_states_); - } - } else { - for (size_t x = 0; x < count; ++x) { - MXNET_CUDA_ONLY(op()->Backward(opContext_, - gpuData->blob_out_grad_, - gpuData->blob_input_vec_, - gpuData->blob_output_vec_, - req, - gpuData->blob_in_grad_, - gpuData->blob_aux_states_)); - } - } - } - - /*! \brief Getter functions for the operator */ - inline Operator *op() { return op_.get(); } - inline const Operator *op() const { return op_.get(); } - - enum BlobVectorType { - kInput, - kOutput, - kAux, - kInGrad, - kOutGrad, - kBlobVectorTypeCount - }; - - #define CASE_STR(__v$) case (__v$): return #__v$ - - /*! 
\brief Convert BlobVectorType enum into a string */ - static inline const char *bvt2String(const BlobVectorType bvt) { - switch (bvt) { - CASE_STR(kInput); - CASE_STR(kOutput); - CASE_STR(kAux); - CASE_STR(kInGrad); - CASE_STR(kOutGrad); - default: - CHECK(false); - return ""; - } - } - #undef CASE_STR - - /*! \brief Return a particular blob in a test data set */ - inline const std::vector& getBlobVect(const BlobVectorType bvt) const { - switch (bvt) { - case kInput: - return c_.blob_input_vec_; - case kOutput: - return c_.blob_output_vec_; - case kAux: - return c_.blob_aux_states_; - case kInGrad: - return c_.blob_in_grad_; - case kOutGrad: - return c_.blob_out_grad_; - default: - CHECK(false); - return c_.blob_input_vec_; - } - } - - /*! \brief Dump an operator's data set into compilable C++ data code for runtime validation - * When writing an operator test, you can generate a "known good operator data state" in C++ - * code with this function, and then use load() to load the blob states into this - * class (BasicOperatorData). - * After that, you can compare with the "actual" operator state (BasicOperatorData) of - * the operator that you are testing. + /*! 
+ * \brief Fill a blob with random values + * \param blob Blob which to fill with random values */ - template - inline void dumpC(Stream *_os, const std::string& label) { - Stream& os = *_os; - os << "static const std::vector< std::vector< std::vector > > ___" - << label << "_data_shape_"; - const TShape& shape = shape_input_vec_[0]; - for (size_t i = 0, n = shape.ndim(); i < n; ++i) { - os << shape[i] << "_"; - } - os << "__ =" << std::endl << "{" << std::endl; - for (size_t x = 0; x < kBlobVectorTypeCount; ++x) { - os << " { /* " << bvt2String(BlobVectorType(x)) << " */" << std::endl; - const std::vector& blobVect = getBlobVect(BlobVectorType(x)); - for (size_t i = 0, n = blobVect.size(); i < n; ++i) { - os << " { "; - test::dump(&os, blobVect[i]); - os << " }"; - if (i < n - 1) { - os << ","; - } - os << std::endl; - } - os << " }"; - if (x < kBlobVectorTypeCount - 1) { - os << ","; - } - os << std::endl; - } - os << "};" << std::endl; - } - - static inline void copy(const TBlob& blob, const DType array[], - const size_t start, const size_t end) { - const size_t blobSize = blob.Size(); - DType *p = blob.dptr(); - for (size_t i = 0, n = end - start; i < n; ++i) { - CHECK_LT(i, blobSize); - p[i] = array[i + start]; - } - } - - /*! \brief Runtime load of the C++ data code generated by dumpC() */ - void load(const std::vector>>& cData) { - for (size_t i = 0, ni = cData.size(); i < ni; ++i) { - for (size_t j = 0, nj = cData[i].size(); j < nj; ++j) { - const TBlob& blob = getBlobVect(BlobVectorType(i))[j]; - const size_t sourceDataSize = cData[i][j].size(); - CHECK_EQ(sourceDataSize, blob.Size()); - const DType *sourceData = &cData[i][j][0]; - copy(blob, sourceData, 0, sourceDataSize); - } - } - } - - /*! 
\brief Runtime load of the C++ data code generated by dumpC() */ - void load(const std::vector>>& cData, - const BlobVectorType type) { - CHECK_LT(type, cData.size()); - for (size_t j = 0, nj = cData[type].size(); j < nj; ++j) { - const TBlob& blob = getBlobVect(type)[j]; - const size_t sourceDataSize = cData[type][j].size(); - CHECK_EQ(sourceDataSize, blob.Size()); - const DType *sourceData = &cData[type][j][0]; - copy(blob, sourceData, 0, sourceDataSize); - } - } - - /*! \brief Runtime load of the C++ data code generated by dumpC() */ - void load(const std::vector>>& cData, - const BlobVectorType type, const int idx) { - CHECK_LT(type, cData.size()); - CHECK_LT(idx, cData[type].size()); - const TBlob& blob = getBlobVect(type)[idx]; - const size_t sourceDataSize = cData[type][idx].size(); - CHECK_EQ(sourceDataSize, blob.Size()); - const DType *sourceData = &cData[type][idx][0]; - copy(blob, sourceData, 0, sourceDataSize); - } - - void FillRandom() { + void FillRandom(const TBlob& blob) const { std::uniform_real_distribution distribution(-1.0, 1.0); - for (size_t j = 0, jn = this->c_.all_blob_vects_.size(); j < jn; ++j) { - std::vector *data_vect = this->c_.all_blob_vects_[j]; - if (data_vect) { - for (size_t i = 0, n = data_vect->size(); i < n; ++i) { - TBlob &blob = (*data_vect)[i]; - test::patternFill(&blob, [this, &distribution]() -> DType { - return distribution(generator()); - }); - } - } - } - } - - /*! 
\brief Input and output blobs */ - OpContext opContext_; - - std::vector shape_input_vec_; - - struct OpData { - std::vector blob_input_vec_; - std::vector blob_output_vec_; - std::vector blob_aux_states_; - std::vector blob_in_grad_; - std::vector blob_out_grad_; // Remaining err (loss) pushing back upstream - - std::vector *> all_blob_vects_; - inline OpData() { - all_blob_vects_.push_back(&blob_input_vec_); - all_blob_vects_.push_back(&blob_output_vec_); - all_blob_vects_.push_back(&blob_aux_states_); - all_blob_vects_.push_back(&blob_in_grad_); - all_blob_vects_.push_back(&blob_out_grad_); // Remaining err (loss) pushing back upstream - } - virtual ~OpData() {} - }; - -#if MXNET_USE_CUDA - class GPUOpData : public OpData { - GPUOpData() = delete; - GPUOpData(const GPUOpData& o) = delete; - - public: - inline GPUOpData(const OpData& cpuData, OpContext *opContext) - : cpuData_(cpuData) - , allocGPUStream_(opContext) { - // Copy CPU->GPU - CHECK_EQ(gpuBlobs_.size(), 0U); - CHECK_EQ(cpuData_.all_blob_vects_.size(), this->all_blob_vects_.size()); - for (size_t bvt = 0, nbvt = cpuData_.all_blob_vects_.size(); bvt < nbvt; ++bvt) { - std::vector& bv_src = *cpuData_.all_blob_vects_[bvt]; - std::vector& bvt_dest = *this->all_blob_vects_[bvt]; - for (size_t i = 0, n = bv_src.size(); i < n; ++i) { - const TBlob& srcBlob = bv_src[i]; - TBlob *destBlob = allocateBlob(&gpuBlobs_, &bvt_dest, srcBlob.shape_, - true, srcBlob.type_flag_); - - Context cpu_ctx, gpu_ctx; - cpu_ctx.dev_type = Context::kCPU; - gpu_ctx.dev_type = Context::kGPU; - cpu_ctx.dev_id = gpu_ctx.dev_id = 0; - - mxnet::ndarray::Copy(srcBlob, destBlob, cpu_ctx, - gpu_ctx, allocGPUStream_.opContext_.run_ctx); - } - } - cudaDeviceSynchronize(); - } - inline ~GPUOpData() { - // Copy GPU->CPU - cudaDeviceSynchronize(); - for (size_t bvt = 0, nbvt = this->all_blob_vects_.size(); bvt < nbvt; ++bvt) { - std::vector& bv_src = *this->all_blob_vects_[bvt]; - std::vector& bvt_dest = *cpuData_.all_blob_vects_[bvt]; - for 
(size_t i = 0, n = bv_src.size(); i < n; ++i) { - const TBlob& srcBlob = bv_src[i]; - TBlob *destBlob = &bvt_dest[i]; - - Context cpu_ctx, gpu_ctx; - cpu_ctx.dev_type = Context::kCPU; - gpu_ctx.dev_type = Context::kGPU; - cpu_ctx.dev_id = gpu_ctx.dev_id = 0; - - mxnet::ndarray::Copy(srcBlob, destBlob, gpu_ctx, - cpu_ctx, allocGPUStream_.opContext_.run_ctx); - } - } - gpuBlobs_.clear(); // Force deallocation of the GPU blob data - cudaDeviceSynchronize(); - } - - private: - /*! \brief Reference to the src/dest CPU data */ - const OpData& cpuData_; - /*! \brief The GPU-allocated blobs */ - std::list> gpuBlobs_; - /*! \brief Scoped GPU stream */ - GPUStreamScope allocGPUStream_; - }; -#endif // MXNET_USE_CUDA - - OpData c_; - - protected: - /*! \brief Allocate the operator's resource requests */ - void allocateResources(const std::vector& reqs) { - std::map cached_temp; - - Context ctx; - ctx.dev_type = isGPU_ ? Context::kGPU : Context::kCPU; - ctx.dev_id = 0; - - for (const ResourceRequest& req : reqs) { - if (req.type == ResourceRequest::kTempSpace) { - if (cached_temp.count(ctx) != 0) { - opContext_.requested.push_back(cached_temp.at(ctx)); - } else { - Resource r = ResourceManager::Get()->Request(ctx, req); - opContext_.requested.push_back(r); - cached_temp[ctx] = r; - } - } else if (req.type == ResourceRequest::kRandom) { - opContext_.requested.push_back(ResourceManager::Get()->Request(ctx, req)); - } else { - LOG(FATAL) << "resource type not yet supported"; - } - } + test::patternFill(&blob, [this, &distribution]() -> DType { + return distribution(this->generator()); + }); } - /*! 
\brief Locally allocate a managed TBlob and insert into the supplied vector */ - static TBlob *allocateBlob(std::list> *standalone_blobs, - std::vector *dest, - const TShape& shape, - const bool isGPU, - const int dtype) { - test::StandaloneBlob *blob = new test::StandaloneBlob(shape, isGPU, dtype); - CHECK_NE(blob, static_cast(nullptr)); - standalone_blobs->push_back(std::unique_ptr(blob)); - (*dest).push_back(*blob); - return blob; - } - - /*! \brief Locally allocate a managed TBlob and insert into the supplied vector */ - inline TBlob *allocateBlob(std::vector *dest, const TShape& shape, - const bool isGPU, const int dtype) { - return allocateBlob(&standalone_blobs_, dest, shape, isGPU, dtype); + void FillZero(const TBlob& blob) const { + std::uniform_real_distribution distribution(-1.0, 1.0); + test::patternFill(&blob, [this, &distribution]() -> DType { + return DType(0); + }); } - /*! \brief mt19937 generator for random number generator */ - std::mt19937& generator() { return *generator_; } - - /*! \brief Performance timing categories */ - enum TimingId { - Forward, - Backward - }; - - /*! \brief The operator */ - std::unique_ptr op_; - /*! \brief Is this for a GPU? */ - const bool isGPU_; - /*! \brief Assure that the Forward initialized only once */ - std::atomic initializeForward_; - /*! \brief Assure that the Forward initialized only once */ - std::atomic initializeBackward_; - /*! \brief Assure that the callback is initialized only once */ - std::atomic initializeCallback_; - /*! \brief scoped lifecycle management of allocated blobs */ - std::list> standalone_blobs_; + private: + /*! + * \brief mt19937 generator for random number generator + * \return reference to mt19937 generator object + */ + std::mt19937& generator() const { return *generator_; } /*! 
\brief Per-test generator */ std::unique_ptr generator_; +}; +//template +class OperatorExecutorTiming { public: + //typedef DType DataType; + //typedef AccReal AccRealType; + + inline test::perf::TimingInstrument& GetTiming() { return timing_; } + + private: /*! Timing instrumentation */ test::perf::TimingInstrument timing_; }; /*! \brief Top-level operator test state info structure */ -template +template struct OpInfo { /*! \brief The operator data */ - std::shared_ptr< test::op::BasicOperatorData > data_; + std::shared_ptr< OperatorExecutor > executor_; /*! \brief The operator prop class */ std::shared_ptr prop_; /*! \brief The input type(s) */ @@ -584,12 +149,12 @@ struct OpInfo { }; /*! \brief Pair of op info objects, generally for validating ops against each other */ -template +template struct OpInfoPair { /*! \brief Operator item 1 */ - test::op::OpInfo info_1_; + test::op::OpInfo info_1_; /*! \brief Operator item 2 */ - test::op::OpInfo info_2_; + test::op::OpInfo info_2_; }; /*! \brief Base validator class for validating test data */ @@ -685,46 +250,24 @@ class Validator { } return true; } - - /*! \brief Compare similar blobs in two operator data structs */ - static bool compare( - const test::op::BasicOperatorData& i1, - const test::op::BasicOperatorData& i2, - const typename test::op::BasicOperatorData::BlobVectorType bvt, - const size_t idx, bool print = false) { - const std::vector& bv1 = i1.getBlobVect(bvt); - const std::vector& bv2 = i2.getBlobVect(bvt); - - // If this is an invalid index, at least make sure the two blob vects - // are similarly too small for the index - if (bv1.size() <= idx) { - CHECK(bv1.size() == bv2.size()); - return true; - } - const TBlob& b1 = bv1[idx]; - const TBlob& b2 = bv2[idx]; - if (print && test::debug_output) { - test::print(RunContext(), &(std::cout << "Blob 1:"), b1, true, true); - test::print(RunContext(), &(std::cout << "Blob 2:"), b2, true, true); - } - return compare(b1, b2); - } }; /*! 
\brief Operator Prop argument key/value pairs */ typedef std::vector > kwargs_t; /*! \brief Create operator data, prop, the operator itself and init default forward input */ -template -static test::op::OpInfo createOpAndInfoF(const bool isGPU, - const TShape &inputShape, - const kwargs_t &kwargs) { - test::op::OpInfo info; - info.data_ = std::make_shared(isGPU, inputShape); +template< + typename OperatorProp, + typename OperatorExecutor, + typename ...Args> +static test::op::OpInfo createOpAndInfoF(const kwargs_t &kwargs, + Args... args) { + test::op::OpInfo info; + info.executor_ = std::make_shared(args...); info.prop_ = std::make_shared(); - info.in_type_ = { mshadow::DataType::kFlag }; + info.in_type_ = { mshadow::DataType::kFlag }; info.prop_->Init(kwargs); - info.data_->initForward(*info.prop_, &info.in_type_); + info.executor_->initForward(*info.prop_, &info.in_type_); return info; } diff --git a/tests/cpp/include/test_op_runner.h b/tests/cpp/include/test_op_runner.h index 6d0b766eb378..06a5a2e6d217 100644 --- a/tests/cpp/include/test_op_runner.h +++ b/tests/cpp/include/test_op_runner.h @@ -33,52 +33,17 @@ namespace mxnet { namespace test { -/*! - * \brief Generic operator random test data - * \tparam DType Main data type - * \tparam AccReal Secondary data type (if any) - */ -template -class GenericOperatorData : public test::op::BasicOperatorData { - public: - typedef DType DataType; - typedef AccReal AccRealType; - - /*! - * \brief Constructor - * \param isGPU Is this to be used on GPU? - * \param inputShape Input shape to the operator - */ - GenericOperatorData(const bool isGPU, const TShape& inputShape) - : test::op::BasicOperatorData(isGPU, inputShape) { - } - - /*! - * \brief Reset forward pass by filling everything with random values - */ - void resetForward() override { - test::op::BasicOperatorData::FillRandom(); - } - - /*! 
- * \brief Reset backward pass by filling everything with random values - */ - void resetBackward() override { - test::op::BasicOperatorData::FillRandom(); - } -}; - /*! * \brief Generic operator runner * \tparam OperatorProp property class for a given operator (i.e. FullyConnectedProp, BatchNormProp) - * \tparam OperatorDataContainer Data container for forward and backward passes for some given + * \tparam OperatorExecutor Data container for forward and backward passes for some given * data types */ -template +template class OperatorRunner { public: - typedef typename OperatorDataContainer::DataType DType; - typedef typename OperatorDataContainer::AccRealType AccReal; + typedef typename OperatorExecutor::DataType DType; + //typedef typename OperatorExecutor::AccRealType AccReal; /*! * \brief Test operator forward pass @@ -89,7 +54,7 @@ class OperatorRunner { * \param count Number of times to run in each direction * \return OpInfo object for further opereator analysis */ - test::op::OpInfo + test::op::OpInfo RunGenericOperatorForward( bool isGPU, const TShape &inputShape, @@ -102,11 +67,10 @@ class OperatorRunner { #else isGPU = false; #endif - test::op::OpInfo info = - test::op::createOpAndInfoF(isGPU, inputShape, kwargs); - info.data_->initForward(*info.prop_, &info.in_type_); - info.data_->forward(count); + test::op::OpInfo info = + test::op::createOpAndInfoF(kwargs, isGPU, inputShape); + info.executor_->initForward(*info.prop_, &info.in_type_); + info.executor_->forward(count); return info; } @@ -116,11 +80,11 @@ class OperatorRunner { * \param count * \return OpInfo object for further opereator analysis */ - test::op::OpInfo RunGenericOperatorBackward( - test::op::OpInfo *info, + test::op::OpInfo RunGenericOperatorBackward( + test::op::OpInfo *info, const size_t count = 1) { - info->data_->initBackward(*info->prop_, &info->in_type_); - info->data_->backward(count); + info->executor_->initBackward(*info->prop_, &info->in_type_); + 
info->executor_->backward(count); return *info; } @@ -133,12 +97,12 @@ class OperatorRunner { * \param count Number of times to run in each direction * \return */ - test::op::OpInfo RunBidirectional( + test::op::OpInfo RunBidirectional( bool isGPU, const TShape &inputShape, const std::vector > &kwargs, const size_t count = 1) { - test::op::OpInfo info = + test::op::OpInfo info = RunGenericOperatorForward(isGPU, inputShape, kwargs, count); return RunGenericOperatorBackward(&info, count); } @@ -204,7 +168,7 @@ class OperatorRunner { const size_t D = dim ? dim - 1U : test::rangedRand(0U, 2U); - test::op::OpInfo info; + test::op::OpInfo info; switch (D) { case 0: info = RunGenericOperatorForward(isGPU, @@ -239,9 +203,9 @@ class OperatorRunner { default: CHECK(false) << "Unsupported dimension count: " << (D + 1); } - if (info.data_.get()) { + if (info.executor_.get()) { RunGenericOperatorBackward(&info, count); - timing += info.data_->timing_; + timing += info.executor_->GetTiming(); } } while (false); diff --git a/tests/cpp/include/test_util.h b/tests/cpp/include/test_util.h index 492a0783d227..29260559ae73 100644 --- a/tests/cpp/include/test_util.h +++ b/tests/cpp/include/test_util.h @@ -534,7 +534,7 @@ inline std::string demangle(const char *name) { * 3D: batch item -> channel -> col */ template -static inline void patternFill(TBlob *blob, GetNextData getNextData) { +static inline void patternFill(const TBlob *blob, GetNextData getNextData) { const size_t dim = blob->ndim(); CHECK_LE(dim, 5U) << "Will need to handle above 3 dimensions (another for loop)"; const size_t num = blob->size(0); diff --git a/tests/cpp/operator/activation_perf.cc b/tests/cpp/operator/activation_perf.cc index c0a42173c003..ba7bc21076f8 100644 --- a/tests/cpp/operator/activation_perf.cc +++ b/tests/cpp/operator/activation_perf.cc @@ -24,10 +24,10 @@ */ #include -#include #include #include "../../src/operator/activation-inl.h" #include "../include/test_op_runner.h" +#include 
"../include/test_legacy_op.h" using namespace mxnet; @@ -41,8 +41,7 @@ TEST(ACTIVATION_PERF, ExecuteBidirectional) { TShape shape({5, 5}); kwargs_t kwargs = basic_activation_args; kwargs.push_back({"act_type", "tanh"}); - test::OperatorRunner> runner; + test::op::LegacyOpRunner runner; runner.RunBidirectional(false, shape, kwargs, 1); } @@ -53,7 +52,7 @@ TEST(ACTIVATION_PERF, TimingCPU) { kwargs_t kwargs = basic_activation_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"act_type", "tanh"}); - test::OperatorRunner> runner; + test::op::LegacyOpRunner runner; runner.RunBidirectional(false, {10, 10, 10, 10}, kwargs, 1); // prime code and cache std::vector shapes; if (test::performance_run) { @@ -83,7 +82,8 @@ TEST(ACTIVATION_PERF, TimingGPU) { kwargs_t kwargs = basic_activation_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"act_type", "tanh"}); - test::OperatorRunner> runner; + test::OperatorRunner> runner; runner.RunBidirectional(true, {10, 10, 10, 10}, kwargs, 1); // prime code and cache std::vector shapes = { {1, 1, 28, 28}, @@ -97,3 +97,4 @@ TEST(ACTIVATION_PERF, TimingGPU) { } } #endif // MXNET_USE_CUDA == 1 + diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc index 0eca871c3e22..51376a281a34 100644 --- a/tests/cpp/operator/batchnorm_test.cc +++ b/tests/cpp/operator/batchnorm_test.cc @@ -27,7 +27,7 @@ #include #include "../../src/operator/batch_norm-inl.h" #include "../../src/operator/batch_norm_v1-inl.h" -#include "test_op.h" +#include "./test_legacy_op.h" #include "executor/exec_pass.h" using namespace mxnet; @@ -57,11 +57,124 @@ static constexpr int TIMING_DEPTH = 2; static constexpr int TIMING_DH = 28; static constexpr int TIMING_DW = 28; + +/*! 
\brief BatchNorm-specific test data */ +template +class BNOperatorExecutor : public test::op::LegacyOperatorExecutor { + public: + BNOperatorExecutor(const bool isGPU, const TShape& inputShape, const bool hasWeightAndBias = false) + : test::op::LegacyOperatorExecutor(isGPU, inputShape) + , hasWeightAndBias_(hasWeightAndBias) { + } + + void resetForward() override { + // Init input data + MSHADOW_TYPE_SWITCH( + this->c_.blob_input_vec_[mxnet::op::batchnorm::kData].type_flag_, + DTypeX, + { + DTypeX val = 0; + test::patternFill(&this->c_.blob_input_vec_[mxnet::op::batchnorm::kData], + [&val]{ return val += 1; }); }); + + MSHADOW_TYPE_SWITCH( + this->c_.blob_input_vec_[mxnet::op::batchnorm::kGamma].type_flag_, + DTypeX, { + const TBlob& blob = this->c_.blob_input_vec_[mxnet::op::batchnorm::kGamma]; + test::fill(blob, DTypeX(1)); + if (hasWeightAndBias_) { + if (blob.size(0) > 1) { + blob.dptr()[1] = DTypeX(3); + } + } + }); + MSHADOW_TYPE_SWITCH( + this->c_.blob_input_vec_[mxnet::op::batchnorm::kBeta].type_flag_, + DTypeX, { + const TBlob& blob = this->c_.blob_input_vec_[mxnet::op::batchnorm::kBeta]; + if (!hasWeightAndBias_) { + test::fill(blob, DTypeX(0)); + } else { // This will cause forward pass check to fail when calculating sum == 0 + test::fill(blob, DTypeX(1)); + if (blob.size(0) > 0) { + blob.dptr()[0] = DTypeX(3); + } + } + }); + + // Init the moving data (all mean = 0, all var = 1) + MSHADOW_TYPE_SWITCH( + this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingMean].type_flag_, + DTypeX, { + test::fill(this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingMean], DTypeX(0)); + }); + MSHADOW_TYPE_SWITCH( + this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingVar].type_flag_, + DTypeX, { + test::fill(this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingVar], DTypeX(1));}); + + for (size_t i = 0, n = this->c_.blob_output_vec_.size(); i < n; ++i) { + const int dtype = this->c_.blob_output_vec_[i].type_flag_; + MSHADOW_TYPE_SWITCH(dtype, DTypeX, + { 
test::fill(this->c_.blob_output_vec_[i], DTypeX(0.1234)); }); + } + } + + void resetBackward() override { + DType val = -.001; + MSHADOW_TYPE_SWITCH( + this->c_.blob_out_grad_[mxnet::op::batchnorm::kOut].type_flag_, + DTypeX, { + test::patternFill(&this->c_.blob_out_grad_[mxnet::op::batchnorm::kOut], + [&val]{ return val += 1; }); + }); + + // out-grad weights + if (mxnet::op::batchnorm::kGamma < this->c_.blob_out_grad_.size()) { + MSHADOW_TYPE_SWITCH( + this->c_.blob_out_grad_[mxnet::op::batchnorm::kGamma].type_flag_, + DTypeX, + { test::try_fill(this->c_.blob_out_grad_, mxnet::op::batchnorm::kGamma, DTypeX(0.1)); }); + } + + // out-grad biases + if (mxnet::op::batchnorm::kBeta < this->c_.blob_out_grad_.size()) { + MSHADOW_TYPE_SWITCH( + this->c_.blob_out_grad_[mxnet::op::batchnorm::kBeta].type_flag_, + DTypeX, + { test::try_fill(this->c_.blob_out_grad_, mxnet::op::batchnorm::kBeta, DTypeX(0.1)); }); + } + + // in-grad + MSHADOW_TYPE_SWITCH( + this->c_.blob_in_grad_[mxnet::op::batchnorm::kData].type_flag_, + DTypeX, + { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kData, DTypeX(0)); }); + + // in-grad weights + if (mxnet::op::batchnorm::kGamma < this->c_.blob_in_grad_.size()) { + MSHADOW_TYPE_SWITCH( + this->c_.blob_in_grad_[mxnet::op::batchnorm::kGamma].type_flag_, + DTypeX, + { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kGamma, DTypeX(0)); }); + } + + // in-grad biases + if (mxnet::op::batchnorm::kBeta < this->c_.blob_in_grad_.size()) { + MSHADOW_TYPE_SWITCH( + this->c_.blob_in_grad_[mxnet::op::batchnorm::kBeta].type_flag_, + DTypeX, + { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kBeta, DTypeX(0)); }); + } + } + + const bool hasWeightAndBias_; // This will cause forward pass validation to fail +}; + /*! \brief Validate batch norm test outputs */ template class BatchNormValidator : public test::op::Validator { typedef test::op::Validator Super; - using Super::compare; /*! 
\brief Only static functions in this class */ BatchNormValidator() = delete; @@ -220,10 +333,40 @@ class BatchNormValidator : public test::op::Validator { } public: + + template + static inline bool compare(const ExecutorType& i1, + const ExecutorType& i2, + const typename + test::op::LegacyOperatorExecutor::BlobVectorType bvt, + const size_t idx, bool print = false) { + // Validate legacy data + auto *legacy1 = dynamic_cast *>(&i1); + auto *legacy2 = dynamic_cast *>(&i2); + CHECK_NOTNULL(legacy1); + CHECK_NOTNULL(legacy2); + const std::vector &bv1 = legacy1->getBlobVect(bvt); + const std::vector &bv2 = legacy2->getBlobVect(bvt); + + // If this is an invalid index, at least make sure the two blob vects + // are similarly too small for the index + if (bv1.size() <= idx) { + CHECK(bv1.size() == bv2.size()); + return true; + } + const TBlob &b1 = bv1[idx]; + const TBlob &b2 = bv2[idx]; + if (print && test::debug_output) { + test::print(RunContext(), &(std::cout << "Blob 1:"), b1, true, true); + test::print(RunContext(), &(std::cout << "Blob 2:"), b2, true, true); + } + return test::op::Validator::compare(b1, b2); + } + /*! \brief Check batch norm output */ template - static void validateForward(const BNOperatorProp& data) { - const TBlob& outputBlob = data.c_.blob_output_vec_[mxnet::op::batchnorm::kData]; + static void validateForward(BNOperatorProp& data) { + const TBlob& outputBlob = data.outputs()[mxnet::op::batchnorm::kData]; switch (outputBlob.ndim()) { case 3: checkBatchNorm1D(&outputBlob); @@ -242,169 +385,57 @@ class BatchNormValidator : public test::op::Validator { /*! 
\brief Compare entire operator data between two test sets */ template - static void compare(const test::op::OpInfo& info_1, - const test::op::OpInfo& info_2) { + static void compare( + const test::op::OpInfo>& info_1, + const test::op::OpInfo>& info_2) { // Input - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInput, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInput, op::batchnorm::kData)); - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInput, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInput, op::batchnorm::kGamma)); - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInput, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInput, op::batchnorm::kBeta)); // Output - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kOutput, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kOutput, op::batchnorm::kOut)); CHECK_EQ(info_2.prop_->getParam().use_global_stats, info_1.prop_->getParam().use_global_stats); #if MXNET_USE_CUDNN != 1 /* CUDNN takes a different approach here on first pass */ // Aux - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kAux, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kAux, op::batchnorm::kMovingMean)); - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kAux, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kAux, op::batchnorm::kMovingVar)); #endif if (!info_2.prop_->getParam().use_global_stats) { - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kOutput, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + 
test::op::LegacyOperatorExecutor::kOutput, op::batchnorm::kMean)); // InGrad - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInGrad, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInGrad, op::batchnorm::kData)); - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInGrad, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInGrad, op::batchnorm::kGamma)); - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInGrad, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInGrad, op::batchnorm::kBeta)); // OutGrad - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kOutGrad, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kOutGrad, op::batchnorm::kData)); } } }; -/*! \brief BatchNorm-specific test data */ -template -class BNOperatorData : public test::op::BasicOperatorData { - public: - BNOperatorData(const bool isGPU, const TShape& inputShape, const bool hasWeightAndBias = false) - : test::op::BasicOperatorData(isGPU, inputShape) - , hasWeightAndBias_(hasWeightAndBias) { - } - - void resetForward() override { - // Init input data - MSHADOW_TYPE_SWITCH( - this->c_.blob_input_vec_[mxnet::op::batchnorm::kData].type_flag_, - DTypeX, - { - DTypeX val = 0; - test::patternFill(&this->c_.blob_input_vec_[mxnet::op::batchnorm::kData], - [&val]{ return val += 1; }); }); - - MSHADOW_TYPE_SWITCH( - this->c_.blob_input_vec_[mxnet::op::batchnorm::kGamma].type_flag_, - DTypeX, { - const TBlob& blob = this->c_.blob_input_vec_[mxnet::op::batchnorm::kGamma]; - test::fill(blob, DTypeX(1)); - if (hasWeightAndBias_) { - if (blob.size(0) > 1) { - blob.dptr()[1] = DTypeX(3); - } - } - }); - MSHADOW_TYPE_SWITCH( - this->c_.blob_input_vec_[mxnet::op::batchnorm::kBeta].type_flag_, - 
DTypeX, { - const TBlob& blob = this->c_.blob_input_vec_[mxnet::op::batchnorm::kBeta]; - if (!hasWeightAndBias_) { - test::fill(blob, DTypeX(0)); - } else { // This will cause forward pass check to fail when calculating sum == 0 - test::fill(blob, DTypeX(1)); - if (blob.size(0) > 0) { - blob.dptr()[0] = DTypeX(3); - } - } - }); - - // Init the moving data (all mean = 0, all var = 1) - MSHADOW_TYPE_SWITCH( - this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingMean].type_flag_, - DTypeX, { - test::fill(this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingMean], DTypeX(0)); - }); - MSHADOW_TYPE_SWITCH( - this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingVar].type_flag_, - DTypeX, { - test::fill(this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingVar], DTypeX(1));}); - - for (size_t i = 0, n = this->c_.blob_output_vec_.size(); i < n; ++i) { - const int dtype = this->c_.blob_output_vec_[i].type_flag_; - MSHADOW_TYPE_SWITCH(dtype, DTypeX, - { test::fill(this->c_.blob_output_vec_[i], DTypeX(0.1234)); }); - } - } - - void resetBackward() override { - DType val = -.001; - MSHADOW_TYPE_SWITCH( - this->c_.blob_out_grad_[mxnet::op::batchnorm::kOut].type_flag_, - DTypeX, { - test::patternFill(&this->c_.blob_out_grad_[mxnet::op::batchnorm::kOut], - [&val]{ return val += 1; }); - }); - - // out-grad weights - if (mxnet::op::batchnorm::kGamma < this->c_.blob_out_grad_.size()) { - MSHADOW_TYPE_SWITCH( - this->c_.blob_out_grad_[mxnet::op::batchnorm::kGamma].type_flag_, - DTypeX, - { test::try_fill(this->c_.blob_out_grad_, mxnet::op::batchnorm::kGamma, DTypeX(0.1)); }); - } - - // out-grad biases - if (mxnet::op::batchnorm::kBeta < this->c_.blob_out_grad_.size()) { - MSHADOW_TYPE_SWITCH( - this->c_.blob_out_grad_[mxnet::op::batchnorm::kBeta].type_flag_, - DTypeX, - { test::try_fill(this->c_.blob_out_grad_, mxnet::op::batchnorm::kBeta, DTypeX(0.1)); }); - } - - // in-grad - MSHADOW_TYPE_SWITCH( - this->c_.blob_in_grad_[mxnet::op::batchnorm::kData].type_flag_, - DTypeX, - 
{ test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kData, DTypeX(0)); }); - - // in-grad weights - if (mxnet::op::batchnorm::kGamma < this->c_.blob_in_grad_.size()) { - MSHADOW_TYPE_SWITCH( - this->c_.blob_in_grad_[mxnet::op::batchnorm::kGamma].type_flag_, - DTypeX, - { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kGamma, DTypeX(0)); }); - } - - // in-grad biases - if (mxnet::op::batchnorm::kBeta < this->c_.blob_in_grad_.size()) { - MSHADOW_TYPE_SWITCH( - this->c_.blob_in_grad_[mxnet::op::batchnorm::kBeta].type_flag_, - DTypeX, - { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kBeta, DTypeX(0)); }); - } - } - - const bool hasWeightAndBias_; // This will cause forward pass validation to fail -}; - static const test::op::kwargs_t blank_kwargs; static const test::op::kwargs_t blank_kwargs_nocudnn = { {"cudnn_off", "True"} }; @@ -433,22 +464,20 @@ static bool isUGS(const test::op::kwargs_t& kwargs) { } #endif // DISABLE_VALIDATION -template -static StreamType& PRT( - StreamType *os, - const test::op::BasicOperatorData& obj, - const typename test::op::BasicOperatorData::BlobVectorType bvt, - const size_t idx) { - *os << test::op::BasicOperatorData::bvt2String(bvt) << ": " << idx +template +static StreamType& PRT(StreamType *os, OperatorExecutor& obj, + const typename OperatorExecutor::BlobVectorType bvt, const size_t idx) { + *os << OperatorExecutor::bvt2String(bvt) << ": " << idx << ": "; const TBlob& blob = obj.getBlobVect(bvt)[idx]; + test::print(RunContext(), os, blob); return *os; } -template +template static StreamType& dumpF(StreamType *os, - const test::op::OpInfo& prop, + const test::op::OpInfo& prop, const size_t x = 0) { if (test::debug_output) { *os << std::endl; @@ -457,24 +486,24 @@ static StreamType& dumpF(StreamType *os, *os << "= " << x << std::endl; *os << "=============================" << std::endl; } - typedef typename test::op::BasicOperatorData::BlobVectorType BlobVectorType; - PRT(os, *prop.data_, 
BlobVectorType::kInput, op::batchnorm::kData); - PRT(os, *prop.data_, BlobVectorType::kInput, op::batchnorm::kGamma); - PRT(os, *prop.data_, BlobVectorType::kInput, op::batchnorm::kBeta); + typedef typename OperatorExecutor::BlobVectorType BlobVectorType; + PRT(os, *prop.executor_, BlobVectorType::kInput, op::batchnorm::kData); + PRT(os, *prop.executor_, BlobVectorType::kInput, op::batchnorm::kGamma); + PRT(os, *prop.executor_, BlobVectorType::kInput, op::batchnorm::kBeta); - PRT(os, *prop.data_, BlobVectorType::kAux, op::batchnorm::kMovingMean); - PRT(os, *prop.data_, BlobVectorType::kAux, op::batchnorm::kMovingVar); + PRT(os, *prop.executor_, BlobVectorType::kAux, op::batchnorm::kMovingMean); + PRT(os, *prop.executor_, BlobVectorType::kAux, op::batchnorm::kMovingVar); - PRT(os, *prop.data_, BlobVectorType::kOutput, op::batchnorm::kOut); - PRT(os, *prop.data_, BlobVectorType::kOutput, op::batchnorm::kMean); - PRT(os, *prop.data_, BlobVectorType::kOutput, op::batchnorm::kVar); + PRT(os, *prop.executor_, BlobVectorType::kOutput, op::batchnorm::kOut); + PRT(os, *prop.executor_, BlobVectorType::kOutput, op::batchnorm::kMean); + PRT(os, *prop.executor_, BlobVectorType::kOutput, op::batchnorm::kVar); } return *os; } -template +template static StreamType& dumpB(StreamType *os, - const test::op::OpInfo& prop, + const test::op::OpInfo& prop, const size_t x = 0) { if (test::debug_output) { *os << std::endl; @@ -484,34 +513,34 @@ static StreamType& dumpB(StreamType *os, *os << "=============================" << std::endl; } - typedef typename test::op::BasicOperatorData::BlobVectorType BlobVectorType; - PRT(os, *prop.data_, BlobVectorType::kInGrad, op::batchnorm::kData); - PRT(os, *prop.data_, BlobVectorType::kInGrad, op::batchnorm::kGamma); - PRT(os, *prop.data_, BlobVectorType::kInGrad, op::batchnorm::kBeta); + typedef typename OperatorExecutor::BlobVectorType BlobVectorType; + PRT(os, *prop.executor_, BlobVectorType::kInGrad, op::batchnorm::kData); + PRT(os, 
*prop.executor_, BlobVectorType::kInGrad, op::batchnorm::kGamma); + PRT(os, *prop.executor_, BlobVectorType::kInGrad, op::batchnorm::kBeta); - PRT(os, *prop.data_, BlobVectorType::kAux, op::batchnorm::kMovingMean); - PRT(os, *prop.data_, BlobVectorType::kAux, op::batchnorm::kMovingVar); + PRT(os, *prop.executor_, BlobVectorType::kAux, op::batchnorm::kMovingMean); + PRT(os, *prop.executor_, BlobVectorType::kAux, op::batchnorm::kMovingVar); - PRT(os, *prop.data_, BlobVectorType::kOutGrad, op::batchnorm::kOut); + PRT(os, *prop.executor_, BlobVectorType::kOutGrad, op::batchnorm::kOut); } return *os; } -template +template static StreamType& dumpF(StreamType *os, - const test::op::OpInfoPair& bi) { + const test::op::OpInfoPair& bi) { return dumpF(&dumpF(os, bi.info_1_, 1), bi.info_2_, 2); } -template +template static StreamType& dumpB(StreamType *os, - const test::op::OpInfoPair& bi) { + const test::op::OpInfoPair& bi) { return dumpB(&dumpB(os, bi.info_1_, 1), bi.info_2_, 2); } /*! \brief Test batch norm operator forward pass */ -template -static test::op::OpInfo TestBatchNormOperatorForward( +template +static test::op::OpInfo TestBatchNormOperatorForward( bool isGPU, const TShape& inputShape, const std::vector >& kwargs, @@ -524,16 +553,17 @@ static test::op::OpInfo TestBatchNormOperatorForwa isGPU = false; #endif - test::op::OpInfo info = test::op::createOpAndInfoF< - OperatorProp, BNOperatorData, DType, AccReal>(isGPU, inputShape, kwargs); + test::op::OpInfo info = test::op::createOpAndInfoF< + OperatorProp, OperatorExecutor>(kwargs, isGPU, inputShape); - info.data_->initForward(*info.prop_, &info.in_type_); + info.executor_->initForward(*info.prop_, &info.in_type_); - info.data_->forward(count); + info.executor_->forward(count); #if !DISABLE_VALIDATION if (!isUGS(kwargs)) { - BatchNormValidator::validateForward(*info.data_); + BatchNormValidator::validateForward(*info.executor_); } #endif @@ -541,20 +571,20 @@ static test::op::OpInfo TestBatchNormOperatorForwa } /*! 
\brief Test batch norm operator backward pass */ -template -static test::op::OpInfo runOperatorBackward( - test::op::OpInfo *info, +template +static test::op::OpInfo runOperatorBackward( + test::op::OpInfo *info, const size_t count = 1) { - info->data_->initBackward(*info->prop_, &info->in_type_); + info->executor_->initBackward(*info->prop_, &info->in_type_); - info->data_->backward(count); + info->executor_->backward(count); return *info; } static constexpr size_t CYCLE_COUNT = 3; -template -static test::op::OpInfoPair testForwardAndBackward( +template +static test::op::OpInfoPair testForwardAndBackward( const bool isGPU1, const bool isGPU2, const TShape &inputShape, @@ -562,22 +592,25 @@ static test::op::OpInfoPair testFo const bool dumpC, const size_t count = 1, const size_t cycleCount = CYCLE_COUNT) { - test::op::OpInfo info_1 = - TestBatchNormOperatorForward(isGPU1, inputShape, + test::op::OpInfo info_1 = + TestBatchNormOperatorForward(isGPU1, inputShape, kwargs, count); - test::op::OpInfo info_2 = - TestBatchNormOperatorForward(isGPU2, inputShape, + test::op::OpInfo info_2 = + TestBatchNormOperatorForward(isGPU2, inputShape, kwargs, count); size_t thisCount = 0; + typedef typename OperatorExecutor::DataType DType; + typedef typename OperatorExecutor::AccRealType AccReal; + do { const bool isLast = thisCount == cycleCount - 1; if (thisCount) { - info_1.data_->forward(count); - info_2.data_->forward(count); + info_1.executor_->forward(count); + info_2.executor_->forward(count); } if (isLast) { @@ -588,18 +621,18 @@ static test::op::OpInfoPair testFo // Check that everything is the same after the forward pass BatchNormValidator::compare(info_1, info_2); - test::op::Validator::compare( - *info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInput, - op::batchnorm::kData); + BatchNormValidator::compare( + *info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInput, + op::batchnorm::kData, false); if (!thisCount) { // return backward 
runOperatorBackward(&info_1, count); runOperatorBackward(&info_2, count); } else { - info_1.data_->backward(count); - info_2.data_->backward(count); + info_1.executor_->backward(count); + info_2.executor_->backward(count); } if (isLast) { @@ -612,14 +645,13 @@ static test::op::OpInfoPair testFo } while (++thisCount < cycleCount); if (dumpC) { - info_1.data_->dumpC(&std::cerr, "BN_testForwardAndBackward"); + info_1.executor_->dumpC(&std::cerr, "BN_testForwardAndBackward"); } return { info_1, info_2 }; } - -template -static test::op::OpInfoPair +template +static test::op::OpInfoPair testForwardAndBackward(const bool isGPU, const TShape &inputShape, const test::op::kwargs_t kwargs, @@ -627,7 +659,7 @@ testForwardAndBackward(const bool isGPU, const size_t count = 1, const size_t cycleCount = CYCLE_COUNT ) { - return testForwardAndBackward( + return testForwardAndBackward( isGPU, isGPU, inputShape, @@ -637,14 +669,14 @@ testForwardAndBackward(const bool isGPU, cycleCount); } -template -static test::op::OpInfoPair +template +static test::op::OpInfoPair testBNForwardAndBackward2D(const bool isGPU, const TShape &inputShape, const test::op::kwargs_t kwargs, const bool dumpC = false) { CHECK_EQ(inputShape.ndim(), 4); // V1 can only handle 2D - return testForwardAndBackward( + return testForwardAndBackward( isGPU, isGPU, inputShape, @@ -661,7 +693,7 @@ TEST(BATCH_NORM, Test2DForwardV1V2) { DType, AccReal, { - auto infoA = testBNForwardAndBackward2D( + auto infoA = testBNForwardAndBackward2D>( false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); }); } @@ -675,14 +707,14 @@ TEST(BATCH_NORM, Test1DForward) { MSHADOW_REAL_TYPE_SWITCH_EX( type, DType, AccReal, { - TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( false, {BATCH_SIZE, CHANNELS, DW}, blank_kwargs); }); } } TEST(BATCH_NORM, Test2DForwardV1) { - TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); @@ -693,7 +725,8 @@ TEST(BATCH_NORM, 
Test2DForward) { MSHADOW_REAL_TYPE_SWITCH_EX( type, DType, AccReal, { - auto opInfoFloatH = TestBatchNormOperatorForward( + auto opInfoFloatH = TestBatchNormOperatorForward>( false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); }); } @@ -704,13 +737,13 @@ TEST(BATCH_NORM, Test3DForward) { MSHADOW_REAL_TYPE_SWITCH_EX( type, DType, AccReal, { - TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( false, {BATCH_SIZE, CHANNELS, DEPTH, DH, DW}, blank_kwargs); }); } } -template +template static void timingTest(const std::string& label, const bool isGPU, const bool stochastic, @@ -751,22 +784,22 @@ static void timingTest(const std::string& label, const size_t D = dim ? dim - 1U : test::rangedRand(0U, 2U); - test::op::OpInfo info; + test::op::OpInfo info; switch (D) { case 0: - info = TestBatchNormOperatorForward( + info = TestBatchNormOperatorForward( isGPU, {batchSize, channels, width}, kwargs, count); break; case 1: - info = TestBatchNormOperatorForward( + info = TestBatchNormOperatorForward( isGPU, {batchSize, channels, height, width}, kwargs, count); break; case 2: - info = TestBatchNormOperatorForward( + info = TestBatchNormOperatorForward( isGPU, {batchSize, channels, depth, height, width}, kwargs, count); @@ -774,9 +807,9 @@ static void timingTest(const std::string& label, default: CHECK(false) << "rangedRand() returned unexpected value"; } - if (info.data_.get()) { - runOperatorBackward(&info, count); - timing += info.data_->timing_; + if (info.executor_.get()) { + runOperatorBackward(&info, count); + timing += info.executor_->GetTiming(); } } while (false); @@ -795,19 +828,17 @@ TEST(BATCH_NORM, TestStochasticTiming_2D) { MSHADOW_REAL_TYPE_SWITCH_EX( mshadow::kFloat32, DType, AccReal, { - timingTest("RANDOM: BatchNormProp", - false, true, - blank_kwargs_nocudnn, - GPU_TEST_DIMENSIONS); }); + timingTest>( + "RANDOM: BatchNormProp", false, true, + blank_kwargs_nocudnn, GPU_TEST_DIMENSIONS); }); #if MXNET_USE_CUDA if (test::unitTestsWithCuda) { 
MSHADOW_REAL_TYPE_SWITCH_EX( mshadow::kFloat32, DType, AccReal, { - timingTest("RANDOM: BatchNormProp", - true, true, - blank_kwargs_nocudnn, - GPU_TEST_DIMENSIONS); }); + timingTest>( + "RANDOM: BatchNormProp", true, true, + blank_kwargs_nocudnn, GPU_TEST_DIMENSIONS); }); } #endif } @@ -823,42 +854,47 @@ TEST(BATCH_NORM, TestTiming_2D) { THISCOUNT = 1; } MSHADOW_REAL_TYPE_SWITCH_EX( - mshadow::kFloat32, DType, AccReal, - { - timingTest("BatchNormV1Prop 2D", - false, false, - blank_kwargs, - 2, THISCOUNT); -#if MXNET_USE_MKL2017 == 1 - timingTest("MKL BatchNormProp 2D", - false, false, - blank_kwargs_nocudnn, - 2, THISCOUNT); -#endif + mshadow::kFloat32, DType, AccReal, { + timingTest>( + "BatchNormV1Prop 2D", + false, false, + blank_kwargs, + 2, THISCOUNT); + #if MXNET_USE_MKL2017 == 1 + timingTest>( + "MKL BatchNormProp 2D", + false, false, + blank_kwargs_nocudnn, + 2, THISCOUNT); + #endif test::ScopeSet disableMKL(&mxnet::op::batchnorm::disable_mkl, true); - timingTest("BatchNormProp 2D", - false, false, - blank_kwargs_nocudnn, - 2, THISCOUNT); -#if MXNET_USE_CUDA + timingTest>( + "BatchNormProp 2D", + false, false, + blank_kwargs_nocudnn, + 2, THISCOUNT); + #if MXNET_USE_CUDA if (test::unitTestsWithCuda) { - timingTest("BatchNormV1Prop 2D", - true, false, - blank_kwargs, - 2, THISCOUNT); - timingTest("BatchNormProp 2D", - true, false, - blank_kwargs_nocudnn, - 2, THISCOUNT); -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 - timingTest("CUDNN BatchNormProp 2D", - true, false, - blank_kwargs, - 2, THISCOUNT); -#endif + timingTest>( + "BatchNormV1Prop 2D", + true, false, + blank_kwargs, + 2, THISCOUNT); + timingTest>( + "BatchNormProp 2D", + true, false, + blank_kwargs_nocudnn, + 2, THISCOUNT); + #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + timingTest>( + "CUDNN BatchNormProp 2D", + true, false, + blank_kwargs, + 2, THISCOUNT); + #endif } -#endif - }); + #endif + }); } /** @@ -867,8 +903,8 @@ TEST(BATCH_NORM, TestTiming_2D) { template struct BothInfo { - 
test::op::OpInfo info_v1_; - test::op::OpInfo info_; + test::op::OpInfo> info_v1_; + test::op::OpInfo> info_; }; TEST(BATCH_NORM, TestBackward2D_Simple) { @@ -876,8 +912,10 @@ TEST(BATCH_NORM, TestBackward2D_Simple) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair< + op::BatchNormV1Prop, op::BatchNormProp, BNOperatorExecutor> bi = + testForwardAndBackward< + op::BatchNormV1Prop, op::BatchNormProp, BNOperatorExecutor>( false, inputShape, blank_kwargs); // Keep it simple }); } @@ -905,14 +943,16 @@ TEST(BATCH_NORM, TestIterAll) { MSHADOW_REAL_TYPE_SWITCH_EX( type, DType, AccReal, { - test::op::OpInfoPair + test::op::OpInfoPair> bi = testForwardAndBackward( + BNOperatorExecutor>( g1 != 0, g2 != 0, shape, kwargs, false); // Keep it simple if (shape.ndim() == 4 && type == mshadow::kFloat32 && !x3) { - test::op::OpInfoPair + test::op::OpInfoPair> bi = testForwardAndBackward( + BNOperatorExecutor>( g1 != 0, g2 != 0, shape, kwargs, false); // Keep it simple } }); @@ -942,21 +982,21 @@ static void test_V1_V2_2D(const test::op::kwargs_t &kwargs, const size_t count) TShape shapes[2] = {2, 3}; const TShape inputShape({2, 3}); - test::op::OpInfo info_1 = test::op::createOpAndInfoF< - op::BatchNormV1Prop, - BNOperatorData, - DType, AccReal>(gpu_V1, inputShape, kwargs); + test::op::OpInfo> info_1 = + test::op::createOpAndInfoF< + op::BatchNormV1Prop, BNOperatorExecutor>( + kwargs, gpu_V1, inputShape); - test::op::OpInfo info_2 = test::op::createOpAndInfoF< - op::BatchNormProp, BNOperatorData, DType, AccReal>( - gpu_V2, inputShape, kwargs); + test::op::OpInfo> info_2 = + test::op::createOpAndInfoF>( + kwargs, gpu_V2, inputShape); - info_1.data_->initForward(*info_1.prop_, &info_1.in_type_); - info_2.data_->initForward(*info_1.prop_, &info_1.in_type_); - info_1.data_->initBackward(*info_1.prop_, &info_1.in_type_); - info_2.data_->initBackward(*info_1.prop_, &info_1.in_type_); + 
info_1.executor_->initForward(*info_1.prop_, &info_1.in_type_); + info_2.executor_->initForward(*info_1.prop_, &info_1.in_type_); + info_1.executor_->initBackward(*info_1.prop_, &info_1.in_type_); + info_2.executor_->initBackward(*info_1.prop_, &info_1.in_type_); - TBlob &blob1 = info_1.data_->c_.blob_input_vec_[op::batchnorm::kData]; + TBlob &blob1 = info_1.executor_->inputs()[op::batchnorm::kData]; test::data_ref(&blob1, {0, 0}) = -0.05f; test::data_ref(&blob1, {0, 1}) = -0.19f; test::data_ref(&blob1, {0, 2}) = 0.02f; @@ -964,7 +1004,7 @@ static void test_V1_V2_2D(const test::op::kwargs_t &kwargs, const size_t count) test::data_ref(&blob1, {1, 1}) = 0.06f; test::data_ref(&blob1, {1, 2}) = -0.01f; - TBlob &blob2 = info_2.data_->c_.blob_input_vec_[op::batchnorm::kData]; + TBlob &blob2 = info_2.executor_->inputs()[op::batchnorm::kData]; test::data_ref(&blob2, {0, 0}) = -0.05f; test::data_ref(&blob2, {0, 1}) = -0.19f; test::data_ref(&blob2, {0, 2}) = 0.02f; @@ -972,20 +1012,20 @@ static void test_V1_V2_2D(const test::op::kwargs_t &kwargs, const size_t count) test::data_ref(&blob2, {1, 1}) = 0.06f; test::data_ref(&blob2, {1, 2}) = -0.01f; - test::data_ref(&info_1.data_->c_.blob_input_vec_[op::batchnorm::kGamma], {1}) = 3; - test::data_ref(&info_2.data_->c_.blob_input_vec_[op::batchnorm::kGamma], {1}) = 3; + test::data_ref(&info_1.executor_->inputs()[op::batchnorm::kGamma], {1}) = 3; + test::data_ref(&info_2.executor_->inputs()[op::batchnorm::kGamma], {1}) = 3; - test::data_ref(&info_1.data_->c_.blob_input_vec_[op::batchnorm::kBeta], {0}) = 3; - test::data_ref(&info_2.data_->c_.blob_input_vec_[op::batchnorm::kBeta], {0}) = 3; + test::data_ref(&info_1.executor_->inputs()[op::batchnorm::kBeta], {0}) = 3; + test::data_ref(&info_2.executor_->inputs()[op::batchnorm::kBeta], {0}) = 3; for (size_t x = 0; x < count; ++x) { - info_1.data_->forward(); - info_2.data_->forward(); + info_1.executor_->forward(); + info_2.executor_->forward(); BatchNormValidator::compare(info_1, 
info_2); - info_1.data_->backward(); - info_2.data_->backward(); + info_1.executor_->backward(); + info_2.executor_->backward(); BatchNormValidator::compare(info_1, info_2); } @@ -1009,8 +1049,10 @@ TEST(BATCH_NORM, TestBackward2D_SimpleNFG) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair> bi = + testForwardAndBackward>( false, inputShape, nonfixgamma_kwargs); }); } @@ -1021,8 +1063,10 @@ TEST(BATCH_NORM, Test2DBackward_Complex) { { test::ScopeSet noDebugOutput(&test::debug_output, false); const TShape inputShape({9, 14, 16, 91}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair> bi = + testForwardAndBackward>( false, inputShape, blank_kwargs); }); } @@ -1031,12 +1075,14 @@ struct Test2DBackward2DPlusLoadAndCompareLogicUtil { template static void test() { const TShape inputShape({1, 1, 2, 1}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair> bi = + testForwardAndBackward>( false, inputShape, blank_kwargs, false, 1, 5); #if MXNET_DUMP_C - bi.info_1_.data_->dumpC(&std::cerr, "Test2DBackward2DPlusLoadAndCompareLogic"); + bi.info_1_.executor_->dumpC(&std::cerr, "Test2DBackward2DPlusLoadAndCompareLogic"); #endif static const std::vector< std::vector< std::vector > > @@ -1067,17 +1113,17 @@ struct Test2DBackward2DPlusLoadAndCompareLogicUtil { // Expected data state when running forward+backward starting with default values // Note: This data structure generated by dumpC() // Test loaded data agsinst calculated data - test::op::OpInfo info_checkLoad = - test::op::createOpAndInfoF, - DType, AccReal>(false, inputShape, blank_kwargs); - info_checkLoad.data_->initForward(*info_checkLoad.prop_, &info_checkLoad.in_type_); - info_checkLoad.data_->initBackward(*info_checkLoad.prop_, &info_checkLoad.in_type_); - info_checkLoad.data_->load(___Test2DBackward2DPlusLoadAndCompareLogic_data_shape_1_1_2_1___); + 
test::op::OpInfo> info_checkLoad = + test::op::createOpAndInfoF>( + blank_kwargs, false, inputShape); + info_checkLoad.executor_->initForward(*info_checkLoad.prop_, &info_checkLoad.in_type_); + info_checkLoad.executor_->initBackward(*info_checkLoad.prop_, &info_checkLoad.in_type_); + info_checkLoad.executor_->load( + ___Test2DBackward2DPlusLoadAndCompareLogic_data_shape_1_1_2_1___); BatchNormValidator::compare(bi.info_1_, info_checkLoad); } }; - TEST(BATCH_NORM, Test2DBackward2DPlusLoadAndCompareLogic) { test::ScopeSet disableMKL(&mxnet::op::batchnorm::disable_mkl, true); MSHADOW_REAL_TYPE_SWITCH_EX( @@ -1087,17 +1133,20 @@ TEST(BATCH_NORM, Test2DBackward2DPlusLoadAndCompareLogic) { }); } -template +template void compare(const bool isGPU, - const test::op::OpInfo& object, - const std::vector< std::vector< std::vector > >& values) { - test::op::OpInfo info_checkLoad = - test::op::createOpAndInfoF, DType, AccReal>( - isGPU, object.data_->c_.blob_input_vec_[0].shape_, blank_kwargs); - info_checkLoad.data_->initForward(*info_checkLoad.prop_, &info_checkLoad.in_type_); - info_checkLoad.data_->initBackward(*info_checkLoad.prop_, &info_checkLoad.in_type_); - info_checkLoad.data_->load(values); - BatchNormValidator::compare(object, info_checkLoad); + const test::op::OpInfo& object, + const std::vector< + std::vector< std::vector > >& values) { + test::op::OpInfo info_checkLoad = + test::op::createOpAndInfoF( + blank_kwargs, isGPU, object.executor_->inputs()[0].shape_); + info_checkLoad.executor_->initForward(*info_checkLoad.prop_, &info_checkLoad.in_type_); + info_checkLoad.executor_->initBackward(*info_checkLoad.prop_, &info_checkLoad.in_type_); + info_checkLoad.executor_->load(values); + BatchNormValidator< + typename OperatorExecutor::DataType, + typename OperatorExecutor::AccRealType>::compare(object, info_checkLoad); } TEST(BATCH_NORM, TestBackward1D_Simple) { @@ -1105,15 +1154,14 @@ TEST(BATCH_NORM, TestBackward1D_Simple) { mshadow::kFloat32, DTypeX, AccReal, { const 
TShape inputShape({1, 1, 2}); - test::op::OpInfo info = - TestBatchNormOperatorForward(false, - inputShape, - blank_kwargs); - info.data_->initBackward(*info.prop_, &info.in_type_); + test::op::OpInfo> info = + TestBatchNormOperatorForward>( + false, inputShape, blank_kwargs); + info.executor_->initBackward(*info.prop_, &info.in_type_); runOperatorBackward(&info); #if MXNET_DUMP_C - info.data_->dumpC(&std::cerr, "BN_TestBackward1D_Simple"); + info.executor_->dumpC(&std::cerr, "BN_TestBackward1D_Simple"); #endif // Expected data state when running forward+backward starting with default values @@ -1152,13 +1200,13 @@ TEST(BATCH_NORM, TestBackward3D) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({2, 3, 2, 3, 5}); - test::op::OpInfo info = - TestBatchNormOperatorForward( + test::op::OpInfo> info = + TestBatchNormOperatorForward>( false, inputShape, blank_kwargs); - info.data_->initBackward(*info.prop_, &info.in_type_); + info.executor_->initBackward(*info.prop_, &info.in_type_); runOperatorBackward(&info); #if MXNET_DUMP_C - info.data_->dumpC(&std::cerr, "TestBackward3D"); + info.executor_->dumpC(&std::cerr, "TestBackward3D"); #endif }); } @@ -1169,8 +1217,10 @@ TEST(BATCH_NORM, Test2DBackwardMixed_cpu_cpu_nfg) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair> bi = + testForwardAndBackward>( false, false, inputShape, nonfixgamma_kwargs, false); dumpF(&std::cout, bi); dumpB(&std::cout, bi); @@ -1183,8 +1233,10 @@ TEST(BATCH_NORM, Test2DBackwardMixed_cpu_cpu_ugs) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair> bi = + testForwardAndBackward>( false, false, inputShape, useglobalstats_kwargs, false); dumpF(&std::cout, bi); dumpB(&std::cout, bi); @@ -1281,8 +1333,8 @@ static void compare(const TBlob& blob, const std::vector& vals) { for (size_t i = 
0, n = vals.size(); i < n; ++i) { const DType vBlob = v[i]; const DType vVect = vals[i]; - const bool near = test::op::Validator::isNear( - vBlob, vVect, test::op::Validator::ErrorBound(&blob)); + const bool near = BatchNormValidator::isNear( + vBlob, vVect, BatchNormValidator::ErrorBound(&blob)); EXPECT_TRUE(near); if (!near) { LOG(WARNING) << vBlob << " is not near enough to " << vVect << std::endl; @@ -1301,8 +1353,8 @@ static void compare(const std::vector>& d1, for (size_t i = 0, n = vec1.size(); i < n; ++i) { const DType v1 = vec1[i]; const DType v2 = vec2[i]; - const bool near = test::op::Validator::isNear( - v1, v2, test::op::Validator::ERROR_BOUND()); + const bool near = BatchNormValidator::isNear( + v1, v2, BatchNormValidator::ERROR_BOUND()); EXPECT_TRUE(near); if (!near) { LOG(WARNING) << v1 << " is not near enough to " << v2 << std::endl; @@ -1442,66 +1494,67 @@ static void runChannelAxisTest( // Create operator 1 with ChannelAxis2 (normally the experimental one) kwargs.push_back({"axis", std::to_string(channelAxis1)}); - test::op::OpInfo info_c1 = test::op::createOpAndInfoF< - op::BatchNormProp, BNOperatorData, DType, AccReal>( - isGPU1, shape_c1, kwargs); + test::op::OpInfo> info_c1 = + test::op::createOpAndInfoF< + op::BatchNormProp, BNOperatorExecutor>( + kwargs, isGPU1, shape_c1); // Create operator 2 with ChannelAxis2 (normally the control one) kwargs.pop_back(); kwargs.push_back({"axis", std::to_string(channelAxis2)}); - test::op::OpInfo info_c2 = test::op::createOpAndInfoF< - op::BatchNormProp, BNOperatorData, DType, AccReal>( - isGPU2, shape_c2, kwargs); + test::op::OpInfo> info_c2 = + test::op::createOpAndInfoF>( + kwargs, isGPU2, shape_c2); kwargs.pop_back(); // Init operators - info_c1.data_->initForward(*info_c1.prop_, &info_c1.in_type_); - info_c1.data_->initBackward(*info_c1.prop_, &info_c1.in_type_); - info_c2.data_->initForward(*info_c2.prop_, &info_c2.in_type_); - info_c2.data_->initBackward(*info_c2.prop_, &info_c2.in_type_); + 
info_c1.executor_->initForward(*info_c1.prop_, &info_c1.in_type_); + info_c1.executor_->initBackward(*info_c1.prop_, &info_c1.in_type_); + info_c2.executor_->initForward(*info_c2.prop_, &info_c2.in_type_); + info_c2.executor_->initBackward(*info_c2.prop_, &info_c2.in_type_); // Save input data to blob with new shape 1 - data_c1.save(info_c1.data_->c_.blob_input_vec_[0], channelAxis1); - ChannelAxisTestData::print("blob 1 input", info_c1.data_->c_.blob_input_vec_[0]); + data_c1.save(info_c1.executor_->inputs()[0], channelAxis1); + ChannelAxisTestData::print("blob 1 input", info_c1.executor_->inputs()[0]); // Save input data to blob with new shape 2 - data_c2.save(info_c2.data_->c_.blob_input_vec_[0], channelAxis2); - ChannelAxisTestData::print("blob 2 input", info_c2.data_->c_.blob_input_vec_[0]); + data_c2.save(info_c2.executor_->inputs()[0], channelAxis2); + ChannelAxisTestData::print("blob 2 input", info_c2.executor_->inputs()[0]); // Save output grad to blob with new shape 1 - grad_c1.save(info_c1.data_->c_.blob_out_grad_[0], channelAxis1); - ChannelAxisTestData::print("blob 1 output grad", info_c1.data_->c_.blob_out_grad_[0]); + grad_c1.save(info_c1.executor_->bwd_inputs()[0], channelAxis1); + ChannelAxisTestData::print("blob 1 output grad", info_c1.executor_->bwd_inputs()[0]); // Save output grad to blob with new shape 2 - grad_c2.save(info_c2.data_->c_.blob_out_grad_[0], channelAxis2); - ChannelAxisTestData::print("blob 2 output grad", info_c2.data_->c_.blob_out_grad_[0]); + grad_c2.save(info_c2.executor_->bwd_inputs()[0], channelAxis2); + ChannelAxisTestData::print("blob 2 output grad", info_c2.executor_->bwd_inputs()[0]); // Run both operators forward and backwards several times for (index_t x = 0; x < numberOfPasses; ++x) { - info_c1.data_->forward(); - info_c2.data_->forward(); + info_c1.executor_->forward(); + info_c2.executor_->forward(); - info_c1.data_->backward(); - info_c2.data_->backward(); + info_c1.executor_->backward(); + 
info_c2.executor_->backward(); } // Transform operator 1's blob output to a normalized shape - data_c1.load(info_c1.data_->c_.blob_output_vec_[0], channelAxis1); + data_c1.load(info_c1.executor_->outputs()[0], channelAxis1); ChannelAxisTestData::print("channel data 1", data_c1.channel_data_); // Transform operator 2's blob output to a normalized shape - data_c2.load(info_c2.data_->c_.blob_output_vec_[0], channelAxis2); + data_c2.load(info_c2.executor_->outputs()[0], channelAxis2); ChannelAxisTestData::print("channel data 2", data_c2.channel_data_); // Compare the operators' output data while they're in a normalized shape compare(data_c1.channel_data_, data_c2.channel_data_); // Transform operator 1's input-grad blob to a normalized shape - grad_c1.load(info_c1.data_->c_.blob_in_grad_[0], channelAxis1); + grad_c1.load(info_c1.executor_->bwd_outputs()[0], channelAxis1); ChannelAxisTestData::print("input grad 1", grad_c1.channel_data_); // Transform operator 2's input-grad blob to a normalized shape - grad_c2.load(info_c2.data_->c_.blob_in_grad_[0], channelAxis2); + grad_c2.load(info_c2.executor_->bwd_outputs()[0], channelAxis2); ChannelAxisTestData::print("input grad 2", grad_c2.channel_data_); // Compare the operators' input grad data while they're in a normalized shape @@ -1576,11 +1629,11 @@ TEST(BATCH_NORM, Test2DForwardV12D_gpu) { MSHADOW_REAL_TYPE_SWITCH_EX( mshadow::kFloat32, DType, AccReal, { - TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( true, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); - TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( true, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); @@ -1592,11 +1645,11 @@ TEST(BATCH_NORM, Test2DForward2D_gpu) { MSHADOW_REAL_TYPE_SWITCH_EX( type, DType, AccReal, { - TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( true, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); - TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( true, {BATCH_SIZE, CHANNELS, DH, DW}, 
blank_kwargs_nocudnn); @@ -1610,7 +1663,8 @@ TEST(BATCH_NORM, Test2DBackwardMixedV1_gpu_cpu) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, blank_kwargs, false); }); } @@ -1620,7 +1674,8 @@ TEST(BATCH_NORM, Test2DBackwardMixedV1Complex_gpu_cpu) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, blank_kwargs, false); }); } @@ -1631,9 +1686,11 @@ TEST(BATCH_NORM, Test2DBackwardMixed_gpu_cpu) { type, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, blank_kwargs, false); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, blank_kwargs_nocudnn, false); }); } @@ -1645,9 +1702,11 @@ TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu) { type, DType, AccReal, { const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, blank_kwargs, false); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, blank_kwargs_nocudnn, false); }); } @@ -1661,7 +1720,8 @@ TEST(BATCH_NORM, Test2DBackwardMixedV1V2Complex_cpu_cpu_nfg) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - testForwardAndBackward( + testForwardAndBackward>( false, false, inputShape, nonfixgamma_kwargs, false); }); } @@ -1672,9 +1732,11 @@ TEST(BATCH_NORM, Test2DBackwardMixed_gpu_cpu_nfg) { type, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, nonfixgamma_kwargs, false); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, nonfixgamma_kwargs_nocudnn, false); }); } @@ -1686,9 +1748,11 @@ TEST(BATCH_NORM, 
Test2DBackwardMixedComplex_gpu_cpu_nfg) { type, DType, AccReal, { const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, nonfixgamma_kwargs, false); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, nonfixgamma_kwargs_nocudnn, false); }); } @@ -1702,8 +1766,10 @@ TEST(BATCH_NORM, Test2DBackwardMixedV1V2Complex_cpu_cpu_ugs) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair> bi = + testForwardAndBackward>( false, false, inputShape, useglobalstats_kwargs, false); dumpF(&std::cout, bi); dumpB(&std::cout, bi); @@ -1716,9 +1782,11 @@ TEST(BATCH_NORM, Test2DBackwardMixed_gpu_cpu_ugs) { type, DType, AccReal, { const TShape inputShape({2, 3, 2, 2}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, useglobalstats_kwargs_nocudnn, false); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, useglobalstats_kwargs, false); }); } @@ -1730,13 +1798,14 @@ TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu_ugs) { type, DType, AccReal, { const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, useglobalstats_kwargs, false); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, useglobalstats_kwargs_nocudnn, false); }); } } #endif // MXNET_USE_CUDA - diff --git a/tests/cpp/operator/core_op_runner_test.cc b/tests/cpp/operator/core_op_runner_test.cc new file mode 100644 index 000000000000..89270eb1dd0d --- /dev/null +++ b/tests/cpp/operator/core_op_runner_test.cc @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file core_op_runner.cc + * \brief Test operator runner (unary and binary ops validated here) + * \note This is NOT where you test your operator performance. These tests validate that + * the testing framework is functional. + * \author Chris Olivier + */ + +#include +#include +#include +#include "../../src/imperative/imperative_utils.h" +#include "../include/test_compute.h" +#include "../include/test_op_runner.h" +#include "../include/test_core_op.h" + +using namespace mxnet; + +using kwargs_t = test::op::kwargs_t; + +static const kwargs_t basic_args = {}; + +static const std::vector> test_unary_operators = + { + {"relu", "" }, // Code can figure out what the backward op is for some + {"sigmoid", "" }, + { "sqrt", "" } + }; + + +static const std::vector> test_binary_operators = + { + {"elemwise_add", "_backward_add"}, + {"elemwise_mul", "_backward_mul"} + }; + +/*! 
+ * \brief Generic bidirectional sanity test for simple unary op + */ +TEST(CORE_OP_RUNNER, ExecuteBidirectionalSimpleUnary) { + TShape shape({5, 5}); + kwargs_t kwargs = basic_args; + + for (const std::pair& i : test_unary_operators) { + const char *op_name = i.first.c_str(); + const char *backward_op_name = i.second.c_str(); + + test::op::CoreOpExecutor op(false, shape); + op.set_verbose(false); + + op.Init(op.ArgsWithOpName(kwargs, op_name, backward_op_name)); + + PRINT_NDARRAYS(op.ctx().run_ctx, op.inputs()); + PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs()); + op.Execute(); + PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs()); + + PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_inputs()); + PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs()); + op.ExecuteBackward(); + PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs()); + } +} + +/*! + * \brief Generic bidirectional sanity test for binary op + */ +TEST(CORE_OP_RUNNER, ExecuteBidirectional) { + for (const std::pair& i : test_binary_operators) { + const char *op_name = i.first.c_str(); + const char *backward_op_name = i.second.c_str(); + + TShape shape({5, 5}); + kwargs_t kwargs = basic_args; + + test::op::CoreOpExecutor op(false, shape); + + op.set_verbose(false); + op.Init(op.ArgsWithOpName(kwargs, op_name, backward_op_name)); + + PRINT_NDARRAYS(op.ctx().run_ctx, op.inputs()); + PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs()); + op.Execute(); + PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs()); + + PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_inputs()); + PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs()); + op.ExecuteBackward(); + PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs()); + } +} + +TEST(CORE_OP_RUNNER, ExecuteBidirectionalRunnerSimpleUnary) { + typedef float DType; + TShape shape({5, 5}); + for (const std::pair& i : test_unary_operators) { + const char *op_name = i.first.c_str(); + const char *backward_op_name = i.second.c_str(); + test::op::CoreOperatorRunner runner; + runner.RunBidirectional(false, shape, 
test::op::CoreOpExecutor::ArgsWithOpName( + basic_args, op_name, backward_op_name), 1); + } +} + +TEST(CORE_OP_RUNNER, ExecuteBidirectionalRunner) { + typedef float DType; + TShape shape({5, 5}); + for (const std::pair& i : test_binary_operators) { + const char *op_name = i.first.c_str(); + const char *backward_op_name = i.second.c_str(); + test::op::CoreOperatorRunner runner; + runner.RunBidirectional(false, shape, test::op::CoreOpExecutor::ArgsWithOpName( + basic_args, op_name, backward_op_name), 1); + } +} + +/*! + * \brief Timing tests for CPU + */ +TEST(CORE_OP_RUNNER, TimingCPUSimpleUnary) { + typedef float DType; + + const char *op_name = "relu"; + + const kwargs_t kwargs = test::op::CoreOpExecutor::ArgsWithOpName(basic_args, op_name); + + test::op::CoreOperatorRunner runner; + runner.RunBidirectional(false, {10, 10, 10, 10}, + kwargs, + 1); // prime code and cache + + std::vector shapes; + if (test::performance_run) { + shapes = { + {1, 1, 28, 28}, + {1, 3, 28, 28}, + {50, 1, 18, 32}, + {50, 3, 18, 32}, + {20, 3, 128, 128} + }; + } else { + shapes = { + {1, 1, 28, 28}, + {50, 3, 18, 32}, + }; + } + for (const TShape &shape : shapes) { + runner.TimingTest(std::string(op_name) + "Operator CPU", false, false, kwargs, 2, 10, shape); + } +} + +TEST(CORE_OP_RUNNER, TimingCPUBinary) { + typedef float DType; + + const char *op_name = "elemwise_add"; + const char *backward_op_name = "_backward_add"; + + const kwargs_t kwargs = test::op::CoreOpExecutor::ArgsWithOpName( + basic_args, op_name, backward_op_name); + + test::op::CoreOperatorRunner runner; + runner.RunBidirectional(false, {10, 10, 10, 10}, + kwargs, + 1); // prime code and cache + + std::vector shapes; + if (test::performance_run) { + shapes = { + {1, 1, 28, 28}, + {1, 3, 28, 28}, + {50, 1, 18, 32}, + {50, 3, 18, 32}, + {20, 3, 128, 128} + }; + } else { + shapes = { + {1, 1, 28, 28}, + {50, 3, 18, 32}, + }; + } + for (const TShape &shape : shapes) { + runner.TimingTest(std::string(op_name) + "Operator 
CPU", false, false, kwargs, 2, 10, shape); + } +} + +#if MXNET_USE_CUDA == 1 +TEST(CORE_OP_RUNNER, TimingGPUSimpleUnary) { + typedef float DType; + + const char *op_name = "relu"; + + const kwargs_t kwargs = test::op::CoreOpExecutor::ArgsWithOpName(basic_args, op_name); + + test::op::CoreOperatorRunner runner; + runner.RunBidirectional(false, {10, 10, 10, 10}, + kwargs, + 1); // prime code and cache + + std::vector shapes; + if (test::performance_run) { + shapes = { + {1, 1, 28, 28}, + {1, 3, 28, 28}, + {50, 1, 18, 32}, + {50, 3, 18, 32}, + {20, 3, 128, 128} + }; + } else { + shapes = { + {1, 1, 28, 28}, + {50, 3, 18, 32}, + }; + } + for (const TShape &shape : shapes) { + runner.TimingTest(std::string(op_name) + "Operator GPU", true, false, kwargs, 2, 10, shape); + }} + +TEST(CORE_OP_RUNNER, TimingGPUBinary) { + typedef float DType; + + const char *op_name = "elemwise_add"; + const char *backward_op_name = "_backward_add"; + + const kwargs_t kwargs = test::op::CoreOpExecutor::ArgsWithOpName( + basic_args, op_name, backward_op_name); + + test::op::CoreOperatorRunner runner; + runner.RunBidirectional(true, {10, 10, 10, 10}, + kwargs, + 1); // prime code and cache + + std::vector shapes; + if (test::performance_run) { + shapes = { + {1, 1, 28, 28}, + {1, 3, 28, 28}, + {50, 1, 18, 32}, + {50, 3, 18, 32}, + {20, 3, 128, 128} + }; + } else { + shapes = { + {1, 1, 28, 28}, + {50, 3, 18, 32}, + }; + } + for (const TShape &shape : shapes) { + runner.TimingTest(std::string(op_name) + "Operator GPU", true, false, kwargs, 2, 10, shape); + } +} + +#endif // MXNET_USE_CUDA == 1 diff --git a/tests/cpp/operator/fully_conn_perf.cc b/tests/cpp/operator/fully_conn_perf.cc index 4cb4b4522a96..9c182222809c 100644 --- a/tests/cpp/operator/fully_conn_perf.cc +++ b/tests/cpp/operator/fully_conn_perf.cc @@ -28,21 +28,20 @@ #include #include "../../src/operator/fully_connected-inl.h" #include "../include/test_op_runner.h" +#include "../include/test_legacy_op.h" using namespace mxnet; 
typedef std::vector > kwargs_t; const kwargs_t basic_fullyconn_args = { {"num_hidden", "250"} }; - /*! * \brief Generic bidirectional sanity test */ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) { TShape shape({5, 5}); kwargs_t kwargs = basic_fullyconn_args; - test::OperatorRunner> runner; + test::op::LegacyOpRunner runner; runner.RunBidirectional(false, shape, kwargs, 1); } @@ -51,8 +50,7 @@ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) { */ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) { kwargs_t kwargs = basic_fullyconn_args; - test::OperatorRunner> - runner; + test::op::LegacyOpRunner runner; runner.RunBidirectional(false, {10, 10, 10, 10}, kwargs, 1); // prime code and cache std::vector shapes; if (test::performance_run) { @@ -80,14 +78,25 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) { */ TEST(FULLY_CONNECTED, FullyConnectedTimingGPU) { kwargs_t kwargs = basic_fullyconn_args; - test::op::OpInfo info; test::OperatorRunner> runner; + test::op::LegacyOperatorExecutor> + runner; runner.RunBidirectional(true, {10, 10, 10, 10}, kwargs, 1); // prime code and cache - const std::vector shapes = { - {1, 1, 28, 28}, {1, 3, 28, 28}, - {50, 1, 18, 32}, {50, 3, 18, 32} - }; + std::vector shapes; + if (test::performance_run) { + shapes = { + {1, 1, 28, 28}, + {1, 3, 28, 28}, + {50, 1, 18, 32}, + {50, 3, 18, 32}, + {20, 3, 128, 128} + }; + } else { + shapes = { + {1, 1, 28, 28}, + {50, 3, 18, 32}, + }; + } for (const TShape& shape : shapes) { runner.TimingTest("Fully connected GPU", true, false, kwargs, 2, 10, shape); } From 9f480ee7d9930cfa08b4d8f85d893088bc152def Mon Sep 17 00:00:00 2001 From: cjolivier01 Date: Wed, 25 Oct 2017 20:41:13 -0700 Subject: [PATCH 144/237] Remove obsolete file --- tests/cpp/include/test_compute.h | 98 ----------------------- tests/cpp/operator/core_op_runner_test.cc | 1 - 2 files changed, 99 deletions(-) delete mode 100644 tests/cpp/include/test_compute.h diff --git a/tests/cpp/include/test_compute.h 
b/tests/cpp/include/test_compute.h deleted file mode 100644 index 0100fde0a3db..000000000000 --- a/tests/cpp/include/test_compute.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file test_op.h - * \brief operator unit test utility functions - * \author Chris Olivier - * - * These classes offer a framework for developing, testing and debugging operators - * in C++. They work for both CPU and GPU modes, as well as offer a timing - * infrastructure in order to test inidividual operator performance. - * - * Operator data can be validated against general logic, - * stored scalar values (which can be generated by this code from an existing operator via - * BasicOperatorData::dumpC(), as well as against each other (ie check that - * GPU, CPU, MKL, and CUDNN operators produce the same output given the same input. 
- * - * test_util.h: General testing utility functionality - * test_perf.h: Performance-related classes - * test_op.h: Operator-specific testing classes - */ -#ifndef TEST_COMPUTE_H_ -#define TEST_COMPUTE_H_ - -#include "../../src/common/utils.h" - -namespace mxnet { -namespace test { -namespace op { - -class ComputeOp /*: public Operator*/ { - public: - ComputeOp(const OpContext &ctx, const nnvm::NodeAttrs& attrs) - : attrs_(attrs) - , forward_(nullptr) - , backward_(nullptr) { - // TODO(cjolivier01): Determine backward op from op - //common::GetFCompute(attrs.op, "FCompute", ctx) - } - - void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { - if(forward_) { - forward_(attrs_, ctx, in_data, req, out_data); - } - } - - void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - if(backward_) { - backward_(attrs_, ctx, { in_data }, req, in_grad); - } - } - -// virtual std::string TypeString() const { -// return "ComputeOp"; -// } - - protected: - nnvm::NodeAttrs attrs_; - FCompute forward_; - FCompute backward_; -}; - -template -class ComputeOpProp /*: public OperatorProperty*/ { - -}; - -} // namespace op -} // namespace test -} // namespace mxnet - -#endif // TEST_OP_H_ diff --git a/tests/cpp/operator/core_op_runner_test.cc b/tests/cpp/operator/core_op_runner_test.cc index 89270eb1dd0d..453b26ce665a 100644 --- a/tests/cpp/operator/core_op_runner_test.cc +++ b/tests/cpp/operator/core_op_runner_test.cc @@ -29,7 +29,6 @@ #include #include #include "../../src/imperative/imperative_utils.h" -#include "../include/test_compute.h" #include "../include/test_op_runner.h" #include "../include/test_core_op.h" From 92dd85f84428ac0d3342269cd359c688cce9927a Mon Sep 17 00:00:00 2001 From: cjolivier01 Date: Wed, 25 Oct 
2017 20:47:04 -0700 Subject: [PATCH 145/237] Fix compile error for non-CUDA build --- tests/cpp/include/test_core_op.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h index 0ed8b7e717a6..a1b0dcd833b8 100644 --- a/tests/cpp/include/test_core_op.h +++ b/tests/cpp/include/test_core_op.h @@ -51,6 +51,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer */ template static inline void AccessAsCPU(const NDArray &src, const RunContext &run_ctx, CallbackFunction cb) { +#if MXNET_USE_CUDA if(src.ctx().dev_type == Context::kCPU) { cb(src); } else { @@ -65,6 +66,9 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer TBlob tmp2 = src.data(); mxnet::ndarray::Copy(on_cpu.data(), &tmp2, gpu_ctx, cpu_ctx, run_ctx); } +#else + cb(src); +#endif } /*! @@ -165,6 +169,12 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer return res; } + /*! + * \brief Attach any temp or tandom resources required to perform the op's compute operation + * \param ctx Operator context object + * \param attrs NodeAttrs structure (node attributes) + * \param op Pointer to nnvm Operator object + */ void AttachResources(OpContext &ctx, const nnvm::NodeAttrs& attrs, const nnvm::Op *op) { static auto& fresource = nnvm::Op::GetAttr("FResourceRequest"); if (fresource.count(op) != 0) { @@ -236,12 +246,18 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer ctx_.is_train = true; ctx_.run_ctx.ctx.dev_id = 0; ctx_.run_ctx.stream = nullptr; + ctx_.run_ctx.ctx.dev_type = Context::kCPU; +#if MXNET_USE_CUDA if (isGPU) { ctx_.run_ctx.ctx.dev_type = Context::kGPU; allocGPUStream_.reset(new GPUStreamScope(&ctx_)); } else { ctx_.run_ctx.ctx.dev_type = Context::kCPU; } +#else + CHECK(!isGPU); + ctx_.run_ctx.ctx.dev_type = Context::kCPU; +#endif } /*! 
@@ -494,10 +510,14 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer * \brief This operator's context object */ OpContext ctx_; + + #if MXNET_USE_CUDA /*! \brief * Scoped GPU stream */ std::unique_ptr allocGPUStream_; + #endif + /*! * \brief Input data shape */ From 505d3e7a3a2a9939e39d9b2b6337efb85f7d8d3f Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 26 Oct 2017 01:03:23 -0700 Subject: [PATCH 146/237] tweaks in quantize --- src/operator/contrib/two_bit_quantize-inl.h | 49 +++++++++++---------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 3f3e7f75a4ec..bc9b14c35e3d 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -107,27 +107,27 @@ inline bool Create2BitArrayType(const nnvm::NodeAttrs &attrs, return true; } -struct init_threshold_2bit { - MSHADOW_XINLINE static void Map(int server_id, - float *out, - const float neg_threshold, - const float pos_threshold, - ps::SArray compr_sizes, - ps::SArray orig_sizes) { - // i for each server - size_t curr_pos = 0; - for (int i=0; i(orig_sizes[server_id]); - } -}; +//struct init_threshold_2bit { +// MSHADOW_XINLINE static void Map(int server_id, +// float *out, +// const float neg_threshold, +// const float pos_threshold, +// ps::SArray compr_sizes, +// ps::SArray orig_sizes) { +// // i for each server +// size_t curr_pos = 0; +// for (int i=0; i(orig_sizes[server_id]); +// } +//}; struct quantize_2bit { MSHADOW_XINLINE static void Map(int out_block_id, @@ -270,11 +270,14 @@ struct dequantize_2bit { for (int i = out_start_id; (i < out_start_id + 16) && (i < original_size); ++i, ++outval ) { ch_ptr += !(i & 3); int col = i & 3; - if ( ((*ch_ptr) & posbits[col]) == posbits[col] ) { + uint8_t mask = posbits[col]; + uint8_t negmask = negbits[col]; + uint8_t masked = *ch_ptr & mask; + if ( masked == mask ) { *outval = pos_threshold; } 
// use posbits for mask as posbits are 11 // compare with negbits - else if ( ((*ch_ptr) & posbits[col]) == negbits[col] ) { + else if ( masked == negmask ) { *outval = neg_threshold; } else { *outval = 0; From 51d03499b6f927a6b631b5b74a8a737a41cc1e3b Mon Sep 17 00:00:00 2001 From: Olivier Date: Thu, 26 Oct 2017 14:58:19 -0700 Subject: [PATCH 147/237] Allow for no backward pass --- tests/cpp/include/test_core_op.h | 41 ++++++--- tests/cpp/include/test_legacy_op.h | 6 ++ tests/cpp/include/test_op_runner.h | 14 ++- tests/cpp/operator/coreop_perf.cc | 139 +++++++++++++++++++++++++++++ 4 files changed, 183 insertions(+), 17 deletions(-) create mode 100644 tests/cpp/operator/coreop_perf.cc diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h index a1b0dcd833b8..7a53f1de8a96 100644 --- a/tests/cpp/include/test_core_op.h +++ b/tests/cpp/include/test_core_op.h @@ -20,14 +20,16 @@ #define TEST_CORE_OP_H_ #include "./test_op.h" +#include "../../../src/imperative/imperative_utils.h" namespace mxnet { namespace test { namespace op { // Tried making this a struct w/constexpr, but getting undefined reference on gcc 5.4.1 -#define COREOP_FWD_OP_NAME_KEY "fwd_op_name" -#define COREOP_BWD_OP_NAME_KEY "bwd_op_name" +#define COREOP_FWD_OP_NAME_KEY "fwd_op_name" +#define COREOP_BWD_OP_NAME_KEY "bwd_op_name" +#define COREOP_BWD_OP_NAME_VALUE_NONE "" /*! 
* Low-noise operator executor @@ -357,22 +359,30 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer AttachResources(ctx_, attrs_, op_); if(!backward_for_op) { + bool no_backward = false; // Set up backward std::vector, std::string>> bwd; if (!bwd_op_name.empty()) { - // Backward op was specified - std::shared_ptr pOp = std::make_shared( - ctx().run_ctx.ctx.dev_type == Context::kGPU, this->outputs()[0].shape()); - bwd.push_back({pOp, bwd_op_name}); + if(bwd_op_name != COREOP_BWD_OP_NAME_VALUE_NONE) { + // Backward op was specified + std::shared_ptr pOp = std::make_shared( + ctx().run_ctx.ctx.dev_type == Context::kGPU, this->outputs()[0].shape()); + bwd.push_back({pOp, bwd_op_name}); + } else { + no_backward = true; + } } else { // Try to figure out backward op bwd = GetBackward(); } - CHECK_GE(bwd.size(), 1U); - for (std::pair, std::string> &bw_item : bwd) { - bw_item.first->set_verbose(verbose_); - backward_.push_back(bw_item.first); - bw_item.first->Init(ArgsWithOpName(args, bw_item.second), {}, {}, this); + if(!no_backward) { + CHECK_GE(bwd.size(), 1U) + << "Can't automatically determine backward op name. Please specify"; + for (std::pair, std::string> &bw_item : bwd) { + bw_item.first->set_verbose(verbose_); + backward_.push_back(bw_item.first); + bw_item.first->Init(ArgsWithOpName(args, bw_item.second), {}, {}, this); + } } } } @@ -396,6 +406,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer } inline void backward(const size_t count) { + CHECK(HasBackward()); perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), Backward, "Backward", count); VTuneResume profile; for (size_t i = 0; i < count; ++i) { @@ -421,12 +432,16 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer functionex_(attrs_, ctx_, inputs_, req_, outputs_); } + bool HasBackward() const { + return !backward_.empty(); + } + /*! 
* \brief Execute backward pass on operator */ bool ExecuteBackward() { CHECK_EQ(initialized_, true); - CHECK(!backward_.empty()); + CHECK(HasBackward()); if (!backward_.empty()) { // Avoid locked ref count here for (std::shared_ptr &p : backward_) { @@ -442,7 +457,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer */ bool ExecuteBackwardEx() { CHECK_EQ(initialized_, true); - CHECK(!backward_.empty()); + CHECK(HasBackward()); if (!backward_.empty()) { // Avoid locked ref count here for (std::shared_ptr &p : backward_) { diff --git a/tests/cpp/include/test_legacy_op.h b/tests/cpp/include/test_legacy_op.h index 76f4713c46a6..bba15cabb9a2 100644 --- a/tests/cpp/include/test_legacy_op.h +++ b/tests/cpp/include/test_legacy_op.h @@ -228,6 +228,12 @@ class LegacyOperatorExecutor : public OperatorDataInitializer } } + /*! + * \brief Test if operator has a backward pass + * \return true if this operator has a backward pass + */ + MSHADOW_CINLINE bool HasBackward() const { return true; } + /*! \brief Getter functions for the operator */ inline Operator *op() { return op_.get(); } inline const Operator *op() const { return op_.get(); } diff --git a/tests/cpp/include/test_op_runner.h b/tests/cpp/include/test_op_runner.h index 06a5a2e6d217..14703e79ca5e 100644 --- a/tests/cpp/include/test_op_runner.h +++ b/tests/cpp/include/test_op_runner.h @@ -83,6 +83,7 @@ class OperatorRunner { test::op::OpInfo RunGenericOperatorBackward( test::op::OpInfo *info, const size_t count = 1) { + CHECK(info->executor_->HasBackward()); info->executor_->initBackward(*info->prop_, &info->in_type_); info->executor_->backward(count); return *info; @@ -104,7 +105,10 @@ class OperatorRunner { const size_t count = 1) { test::op::OpInfo info = RunGenericOperatorForward(isGPU, inputShape, kwargs, count); - return RunGenericOperatorBackward(&info, count); + if(info.executor_->HasBackward()) { + return RunGenericOperatorBackward(&info, count); + } + return info; } /*! 
@@ -203,11 +207,13 @@ class OperatorRunner { default: CHECK(false) << "Unsupported dimension count: " << (D + 1); } - if (info.executor_.get()) { - RunGenericOperatorBackward(&info, count); + if (info.executor_) { + if(info.executor_->HasBackward()) { + RunGenericOperatorBackward(&info, count); + } timing += info.executor_->GetTiming(); } - } while (false); + } timing.print(&std::cout, label); std::cout << std::endl << std::flush; diff --git a/tests/cpp/operator/coreop_perf.cc b/tests/cpp/operator/coreop_perf.cc new file mode 100644 index 000000000000..590f1b61a90f --- /dev/null +++ b/tests/cpp/operator/coreop_perf.cc @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file activation_perf.cc + * \brief Perf/profile run of ActivationOp + * \author Chris Olivier + */ + +#include +#include +#include "../../src/operator/activation-inl.h" +#include "../include/test_op_runner.h" +#include "../include/test_core_op.h" + +using namespace mxnet; + +using kwargs_t = test::op::kwargs_t; + +template +static void RunCoreOpBidirectional(const bool isGPU, + const kwargs_t& op_kwargs, + const char *op_name, + const char *backward_op_name = "") { + const TShape shape({5, 5}); + test::op::CoreOpExecutor op(isGPU, shape); + op.set_verbose(false); + + op.Init(op.ArgsWithOpName(op_kwargs, op_name, backward_op_name)); + + PRINT_NDARRAYS(op.ctx().run_ctx, op.inputs()); + PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs()); + op.Execute(); + PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs()); + if(op.HasBackward()) { + PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_inputs()); + PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs()); + op.ExecuteBackward(); + PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs()); + } +} + + +template +static void RunCoreOpTimingTest(const bool isGPU, + const kwargs_t& op_kwargs, + const char *op_name, + const char *backward_op_name = "") { + const kwargs_t kwargs = test::op::CoreOpExecutor::ArgsWithOpName( + op_kwargs, op_name, backward_op_name); + + // prime code and cache before the performance runs + test::op::CoreOperatorRunner runner; + runner.RunBidirectional(false, {20, 3, 128, 128}, kwargs, 1); + + // Do the performance runs + std::vector shapes; + if (test::performance_run) { + shapes = { + {1, 1, 28, 28}, + {1, 3, 28, 28}, + {50, 1, 18, 32}, + {50, 3, 18, 32}, + {20, 3, 128, 128} + }; + } else { + shapes = { + {1, 1, 28, 28}, + {50, 3, 18, 32}, + }; + } + const char *pu = isGPU ? "GPU" : "CPU"; + for (const TShape &shape : shapes) { + runner.TimingTest(std::string(op_name) + " Operator " + pu, isGPU, false, kwargs, 2, 10, shape); + } +} + +const kwargs_t basic_args = { }; + +/*! 
+ * \brief Generic bidirectional sanity test + */ +TEST(COREOP_PERF, ExecuteBidirectional) { + std::cout << "NEGATIVE CLIP GRADIENT" << std::endl; + RunCoreOpBidirectional(false, { {"lr", "0.01" }, { "clip_gradient", "-1" } }, + "sgd_mom_update", + COREOP_BWD_OP_NAME_VALUE_NONE); + std::cout << "POSITIVE CLIP GRADIENT" << std::endl; + RunCoreOpBidirectional(false, { {"lr", "0.01" }, { "clip_gradient", "1" } }, + "sgd_mom_update", + COREOP_BWD_OP_NAME_VALUE_NONE); +} + +/*! + * \brief ActivationOp timing test for CPU + */ +TEST(COREOP_PERF, TimingCPU) { + std::cout << "NEGATIVE CLIP GRADIENT" << std::endl; + RunCoreOpTimingTest(false, { {"lr", "0.01" }, { "clip_gradient", "-1" } }, + "sgd_mom_update", + COREOP_BWD_OP_NAME_VALUE_NONE); + std::cout << "POSITIVE CLIP GRADIENT" << std::endl; + RunCoreOpTimingTest(false, { {"lr", "0.01" }, { "clip_gradient", "1" } }, + "sgd_mom_update", + COREOP_BWD_OP_NAME_VALUE_NONE); +} + +#if MXNET_USE_CUDA == 1 +/*! + * \brief ActivationOp timing test for GPU + */ +TEST(COREOP_PERF, TimingGPU) { + std::cout << "NEGATIVE CLIP GRADIENT" << std::endl; + RunCoreOpTimingTest(true, { {"lr", "0.01" }, { "clip_gradient", "-1" } }, + "sgd_mom_update", + COREOP_BWD_OP_NAME_VALUE_NONE); + std::cout << "POSITIVE CLIP GRADIENT" << std::endl; + RunCoreOpTimingTest(true, { {"lr", "0.01" }, { "clip_gradient", "1" } }, + "sgd_mom_update", + COREOP_BWD_OP_NAME_VALUE_NONE); +} +#endif // MXNET_USE_CUDA == 1 + From 3e17ec382c3184b0a1a56ba3478cf17b44da4aed Mon Sep 17 00:00:00 2001 From: Olivier Date: Thu, 26 Oct 2017 15:41:40 -0700 Subject: [PATCH 148/237] Remove unused var --- tests/cpp/operator/coreop_perf.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/cpp/operator/coreop_perf.cc b/tests/cpp/operator/coreop_perf.cc index 590f1b61a90f..66ca6faecd68 100644 --- a/tests/cpp/operator/coreop_perf.cc +++ b/tests/cpp/operator/coreop_perf.cc @@ -91,8 +91,6 @@ static void RunCoreOpTimingTest(const bool isGPU, } } -const kwargs_t basic_args = { }; - 
/*! * \brief Generic bidirectional sanity test */ From 248908ca076057bfbc6369c24e2fa78815425cc4 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 26 Oct 2017 17:11:46 -0700 Subject: [PATCH 149/237] making quantize all compatible as operators --- include/mxnet/ndarray.h | 2 +- src/kvstore/kvstore_dist_server.h | 8 +-- src/ndarray/ndarray.cc | 10 +-- src/ndarray/ndarray_function.cc | 4 +- src/ndarray/ndarray_function.cu | 4 +- src/ndarray/ndarray_function.h | 2 +- src/operator/contrib/two_bit_quantize-inl.h | 75 ++++++++++++++++++++- tests/python/unittest/twobit.py | 26 +++++++ 8 files changed, 113 insertions(+), 18 deletions(-) create mode 100644 tests/python/unittest/twobit.py diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 7e0a4663ed56..215707914d03 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -917,7 +917,7 @@ void Quantize(const NDArray &from, NDArray *to, NDArray *residual, const std::st * \param compress type of compression * \param priority Priority of the action. */ -void Dequantize(const NDArray &from, NDArray *to, int original_size, +void Dequantize(const NDArray &from, NDArray *to, const float neg_threshold, const float pos_threshold, const std::string& compress, int priority); /*! 
diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index ad2f82175bfa..47e241e1a542 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -398,7 +398,7 @@ class KVStoreDistServer { if (stored.is_none()) { // initialization stored = NDArray(dshape, Context()); - Dequantize(recved, &stored, original_size, neg_threshold, pos_threshold, compress_, 0); + Dequantize(recved, &stored, neg_threshold, pos_threshold, compress_, 0); server->Response(req_meta); stored.WaitToRead(); } else if (sync_mode_) { @@ -408,16 +408,16 @@ class KVStoreDistServer { merged.array = NDArray(dshape, Context()); } if (merged.request.size() == 0) { - Dequantize(recved, &merged.array, original_size, neg_threshold, pos_threshold, compress_, 0); + Dequantize(recved, &merged.array, neg_threshold, pos_threshold, compress_, 0); } else { - Dequantize(recved, &decomp_buf, original_size, neg_threshold, pos_threshold, compress_, 0); + Dequantize(recved, &decomp_buf, neg_threshold, pos_threshold, compress_, 0); merged.array += decomp_buf; } merged.request.push_back(req_meta); ApplyUpdates(key, &merged, &stored, server); } else { // async push - Dequantize(recved, &decomp_buf, original_size, neg_threshold, pos_threshold, compress_, 0); + Dequantize(recved, &decomp_buf, neg_threshold, pos_threshold, compress_, 0); exec_.Exec([this, key, &decomp_buf, &stored]() { CHECK(updater_); updater_(key, decomp_buf, &stored); diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index df40ec048d14..3a8e295a89b6 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -609,7 +609,7 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) { // } // } -void Dequantize(const NDArray &from, NDArray *to, int original_size, +void Dequantize(const NDArray &from, NDArray *to, const float neg_threshold, const float pos_threshold, const std::string& compress, int priority) { CHECK(from.shape().ndim() != 0) << "source operands have 
zero dimension shape"; @@ -617,9 +617,9 @@ void Dequantize(const NDArray &from, NDArray *to, int original_size, int b = to->ctx().dev_mask(); if (a == cpu::kDevMask && b == cpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([from, to, original_size, neg_threshold, pos_threshold](RunContext ctx) { + Engine::Get()->PushSync([from, to, neg_threshold, pos_threshold](RunContext ctx) { std::vector inputs = {from.data(), to->data()}; - mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs, original_size, neg_threshold, pos_threshold); + mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs, neg_threshold, pos_threshold); }, from.ctx(), {from.var()}, {to->var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { @@ -629,9 +629,9 @@ void Dequantize(const NDArray &from, NDArray *to, int original_size, #if MXNET_USE_CUDA if (a == gpu::kDevMask && b == gpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([from, to, original_size, neg_threshold, pos_threshold](RunContext ctx) { + Engine::Get()->PushSync([from, to, neg_threshold, pos_threshold](RunContext ctx) { std::vector inputs = {from.data(), to->data()}; - mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs, original_size, neg_threshold, pos_threshold); + mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs, neg_threshold, pos_threshold); // Wait GPU kernel to complete ctx.get_stream()->Wait(); }, from.ctx(), {from.var()}, {to->var()}, diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index 19e5cb1962c4..74688d3aecbe 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -189,9 +189,9 @@ void ElementwiseSum(mshadow::Stream* s, * \brief Enables use of function defined under Dequantize2Bit operator for an ndarray */ template<> -void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, int original_size, +void Dequantize2BitDispatch(mshadow::Stream* 
s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - mxnet::op::Dequantize2BitImpl(s, inputs, original_size, neg_threshold, pos_threshold); + mxnet::op::Dequantize2BitImpl(s, inputs, neg_threshold, pos_threshold); } /* diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index 56dd90ca19d1..bd4ccf9d8711 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -207,9 +207,9 @@ void ElementwiseSum(mshadow::Stream* s, * \brief Enables use of function defined under Dequantize2Bit operator for an ndarray */ template<> -void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, int original_size, +void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - mxnet::op::Dequantize2BitImpl(s, inputs, original_size, neg_threshold, pos_threshold); + mxnet::op::Dequantize2BitImpl(s, inputs, neg_threshold, pos_threshold); } /* diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index a76a2f8b927e..0225f87fba5a 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -168,7 +168,7 @@ void Copy(const TBlob &from, TBlob *to, * \brief Enables use of function defined under Dequantize2Bit operator for an ndarray */ template -void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, int original_size, +void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold); /* diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index bc9b14c35e3d..c69a0ba238f9 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -204,6 +204,41 @@ void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, pos_threshold); // positive threshold } +template +void 
Quantize2BitImpl2(mshadow::Stream* s, const std::vector& inputs, + const float neg_threshold, const float pos_threshold) { + // Init threshold and original size +// mxnet_op::Kernel::Launch(s, push_pskv_lens.size(), +// inputs[2].dptr(), // compressed array (concat for all servers) +// neg_threshold, pos_threshold, +// push_pskv_lens, pull_pskv_lens); + +// std::unordered_set meta_pos; +// std::vector cumulative_part_indices; +// int cur_pos = 0; +// int cum_index = 0; +// for(int i=0; i::Launch(s, inputs[2].Size(), // compressed array + inputs[0].Size(), +// meta_pos, cumulative_part_indices, +// push_pskv_lens, // compressed sizes +// pull_pskv_lens, // original sizes + inputs[2].dptr(), // compressed array + inputs[0].dptr(), // input array + inputs[1].dptr(), // residual array + neg_threshold, // negative threshold + pos_threshold); // positive threshold +} + + // this function has been defined as quantize_2bit operator template void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, @@ -213,7 +248,7 @@ void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); const TwoBitParam& param = nnvm::get(attrs.parsed); -// Quantize2BitImpl(s, inputs, param.neg_threshold, param.pos_threshold); + Quantize2BitImpl2(s, inputs, param.neg_threshold, param.pos_threshold); } inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, @@ -253,6 +288,39 @@ inline bool Quantize2BitType(const nnvm::NodeAttrs& attrs, return true; } +struct dequantize_2bit_all { + // Decompress + MSHADOW_XINLINE static void Map(int compr_block_id, + int original_size, + float *out, + float *in, + const float neg_threshold, + const float pos_threshold) { + + int out_start_id = compr_block_id<<4; + float* outval = out + out_start_id; + char* ch_ptr = reinterpret_cast(in + compr_block_id); + const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const int negbits[] = {0x80, 0x20, 0x08, 0x02}; + for (int i = out_start_id; (i < out_start_id + 
16) && (i < original_size); ++i, ++outval ) { + ch_ptr += !(i & 3); + int col = i & 3; + uint8_t mask = posbits[col]; + uint8_t negmask = negbits[col]; + uint8_t masked = *ch_ptr & mask; + if ( masked == mask ) { + *outval = pos_threshold; + } // use posbits for mask as posbits are 11 + // compare with negbits + else if ( masked == negmask ) { + *outval = neg_threshold; + } else { + *outval = 0; + } + } + } +}; + struct dequantize_2bit { // Decompress MSHADOW_XINLINE static void Map(int compr_block_id, @@ -306,9 +374,10 @@ struct dequantize_2bit { }; template -void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, int original_size, +void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { // Can only decompress the float32 data + int original_size = inputs[1].Size(); mxnet_op::Kernel::Launch(s, original_size/16, // original size original_size, inputs[1].dptr(), // out array @@ -324,7 +393,7 @@ void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); -// Dequantize2BitImpl(s, inputs); + Dequantize2BitImpl(s, inputs, 0.5, 0.5); } inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, diff --git a/tests/python/unittest/twobit.py b/tests/python/unittest/twobit.py new file mode 100644 index 000000000000..67541c6fe37a --- /dev/null +++ b/tests/python/unittest/twobit.py @@ -0,0 +1,26 @@ +from __future__ import print_function +import numpy as np +import mxnet as mx +import random +import itertools +from numpy.testing import assert_allclose, assert_array_equal +from mxnet.test_utils import * +import unittest +import timeit + +shape = [(268435456)] #(25,),(16,),(1121),(14400),(144000), + +# for shape in orig_shape: +grad = mx.nd.random_uniform(-0.9,0.9, shape=shape, ctx=default_context()) +residual = mx.nd.random_uniform(-0.6,0.6, shape=shape, ctx=default_context()) +res = 
mx.nd.array(residual) +mx.nd.waitall() + +def run(): + compr = mx.contrib.nd.create_2bit(grad) + decompr = mx.nd.array(grad.shape) + mx.contrib.ndarray.quantize_2bit(grad, res, compr, -0.5, 0.5) + mx.contrib.ndarray.dequantize_2bit(compr, decompr) + mx.nd.waitall() +d = timeit.repeat(run, repeat=10, number=1) +print(d) \ No newline at end of file From 35b42f7036c8969c8d5ff41c8cda280e4b497d3c Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 26 Oct 2017 18:01:58 -0700 Subject: [PATCH 150/237] separate mshadow and loop operators --- src/operator/contrib/two_bit_quantize-inl.h | 88 ++++++++++++++++++++- src/operator/contrib/two_bit_quantize.cc | 78 ++++++++++++++++++ 2 files changed, 164 insertions(+), 2 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index c69a0ba238f9..1b18d46265f7 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -241,7 +241,7 @@ void Quantize2BitImpl2(mshadow::Stream* s, const std::vector& inputs // this function has been defined as quantize_2bit operator template -void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, +void Quantize2BitComputeMShadow(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, @@ -251,6 +251,49 @@ void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, Quantize2BitImpl2(s, inputs, param.neg_threshold, param.pos_threshold); } +template +void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + mshadow::Stream *s = ctx.get_stream(); + const TwoBitParam& param = nnvm::get(attrs.parsed); + float neg_threshold = param.neg_threshold; + float pos_threshold = param.pos_threshold; + int original_size = inputs[0].Size(); + float *out = inputs[2].dptr(); + float *grad = inputs[0].dptr(); + float *residual = inputs[1].dptr(); + for (int 
out_block_id=0; out_block_id (compr_block); + const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const int negbits[] = {0x80, 0x20, 0x08, 0x02}; + + char *curr_byte = block_ptr; + for (int i = start; i < end && i < original_size; i++) { + // // adds 1 when i-start divisible by 4 + curr_byte += ((i - start) & 3); + residual[i] += grad[i]; + if (residual[i] >= pos_threshold) { + residual[i] -= pos_threshold; + // set data to 11 + *curr_byte |= posbits[(i & 3)]; + } else if (residual[i] <= neg_threshold) { + residual[i] -= neg_threshold; + // set data to 10 + *curr_byte |= negbits[(i & 3)]; + } + } + } +} + inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { @@ -374,7 +417,7 @@ struct dequantize_2bit { }; template -void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, +void Dequantize2BitImplMShadow(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { // Can only decompress the float32 data int original_size = inputs[1].Size(); @@ -384,6 +427,37 @@ void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& input inputs[0].dptr(), // compressed array neg_threshold, // negative threshold pos_threshold); // positive threshold + +} + template + void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float neg_threshold, const float pos_threshold) { + int original_size = inputs[1].Size(); + float* out = inputs[1].dptr(); + float* in = inputs[0].dptr(); + for (int compr_block_id=0; compr_block_id(in + compr_block_id); + const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const int negbits[] = {0x80, 0x20, 0x08, 0x02}; + for (int i = out_start_id; (i < out_start_id + 16) && (i < original_size); ++i, ++outval ) { + ch_ptr += !(i & 3); + int col = i & 3; + uint8_t mask = posbits[col]; + uint8_t negmask = negbits[col]; + uint8_t masked = *ch_ptr & mask; + if ( masked == mask ) { + *outval = pos_threshold; + } // use posbits for mask 
as posbits are 11 + // compare with negbits + else if ( masked == negmask ) { + *outval = neg_threshold; + } else { + *outval = 0; + } + } + } } template @@ -396,6 +470,16 @@ void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, Dequantize2BitImpl(s, inputs, 0.5, 0.5); } +template +void Dequantize2BitComputeMShadow(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + mshadow::Stream *s = ctx.get_stream(); + Dequantize2BitImpl(s, inputs, 0.5, 0.5); +} + inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index 6221aa77e871..a5225103cf7e 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -78,6 +78,55 @@ The residual is also updated to [1.0, -3.0, -1.0, -3.0]. .add_argument("quantized_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_arguments(TwoBitParam::__FIELDS__()); + NNVM_REGISTER_OP(_contrib_quantize_mshadow_2bit) + .describe(R"code(Quantize an input tensor into using 2bits for each value using +user-specified thresholds, while storing quantization error in residual array. + +The quantize_2bit operator takes 5 arguments and is called as follows: +`quantize_2bit(array, residual, out, neg_threshold, pos_threshold)`. +The operator modifies `residual` and `out` arrays. +The `out`variable will be the quantized array. Note that, `out` array can be generated by +invoking `create_2bit(array)`, avoiding calculation of size of quantized array. +This `out` array has first three elements as negative threshold, positive threshold, +and size of the original uncompressed array. Any elements after these three elements +represent quantized data. 
+The operation sums up array and residual, and then +applies the thresholds to quantize the data into one of three states +represented by 2bits. 16 such quantized floats in the original array +are packed together into one float in the `out` array. +The quantization error is stored in residual array. + +For example, assume the input array (gradient) is [5.0, -1.0, -5.0, -4.0], and the +residual is [0.0, -2.0, 0, 1.0]. Let the negative and positive thresholds be +-4.0 and +4.0, respectively. In this method, the elements whose +(gradient + residual) >= pos_threshold will be quantized into 2-bits '01', +and the elements whose (gradient + residual) <= neg_threshold will be +quantized into 2-bits '10'. The other elements will be quantized +as '00'. Every 16 floats in the original array will be packed +into one float variable in the output array. + +In this example, 'out' has 4 elements. The first element stores the +neg_threshold (-4.0), the second element stores the pos_threshold (+4.0), the +third element stores the original size of the uncompressed array, and the +original array will be quantized into a single element in the last element. +The residual is also updated to [1.0, -3.0, -1.0, -3.0]. 
+)code" ADD_FILELINE) + .set_num_inputs(3) + .set_num_outputs(0) + .set_attr_parser(ParamParser) + .set_attr("FInferShape", Quantize2BitShape) + .set_attr("FInferType", Quantize2BitType) + .set_attr("FCompute", Quantize2BitComputeMShadow) + .set_attr("FGradient", ElemwiseGradUseNone{"_quantize_2bit"}) + .set_attr("FMutateInputs", + [](const nnvm::NodeAttrs& attrs) { + return std::vector{1, 2}; + }) + .add_argument("gradient_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") + .add_argument("residual_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") + .add_argument("quantized_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") + .add_arguments(TwoBitParam::__FIELDS__()); + NNVM_REGISTER_OP(_contrib_create_2bit) .describe(R"code(Generate an array with the right shape to store the input data after two bit quantization. This array will be on the same context as input array. @@ -118,5 +167,34 @@ invoking dequantize_2bit(out, array), the 'array' argument will become }) .add_argument("quantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_argument("dequantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); + +NNVM_REGISTER_OP(_contrib_dequantize_mshadow_2bit) +.describe(R"code(Dequantize an input tensor quantized by quantize_2bit. + +The dequantize_2bit operator takes two input arguments. The first input is a NDArray, +which has been generated by quantize_2bit(). This operator expects the first +three elements to be the negative threshold, positive threshold, and the size +of the original uncompressed array. Starting from the fourth element are expected to +be quantized values of the original array. +The second input is a NDArray that has the same shape as the original +array before quantizing. The operator replaces the contents of this array +with dequantized data. 
+ +In the example was described for quantize_2bit, +invoking dequantize_2bit(out, array), the 'array' argument will become +[4.0, 0, -4.0, 0], where -4.0 and 4.0 are the negative and positive thresholds. +)code" ADD_FILELINE) + .set_num_inputs(2) + .set_num_outputs(0) + .set_attr("FInferShape", Dequantize2BitShape) + .set_attr("FInferType", Dequantize2BitType) + .set_attr("FCompute", Dequantize2BitComputeMShadow) + .set_attr("FGradient", ElemwiseGradUseNone{"_dequantize_2bit"}) + .set_attr("FMutateInputs", + [](const nnvm::NodeAttrs& attrs) { + return std::vector{1}; + }) + .add_argument("quantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") + .add_argument("dequantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); } // namespace op } // namespace mxnet From cabb948e36433395e93f41cbb8f343b1bcdadc36 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 27 Oct 2017 07:56:58 +0000 Subject: [PATCH 151/237] working profiler, dequantize mshadow is slow --- src/operator/contrib/two_bit_quantize-inl.h | 41 +++++++++++++++---- src/operator/contrib/two_bit_quantize.cc | 2 +- tests/python/unittest/twobit.py | 19 +++++++-- twobit2.py | 44 +++++++++++++++++++++ 4 files changed, 94 insertions(+), 12 deletions(-) create mode 100644 twobit2.py diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 1b18d46265f7..ac69d7263077 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -87,8 +87,8 @@ inline bool Create2BitArrayShape(const nnvm::NodeAttrs& attrs, CHECK(!shape_is_none(in_attrs->at(0))); // output int shape = in_attrs->at(0).Size() % 16 == 0 ? 
- in_attrs->at(0).Size() / 16 + 3: - in_attrs->at(0).Size() / 16 + 4; + in_attrs->at(0).Size() / 16 : + in_attrs->at(0).Size() / 16 + 1; SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape{shape}); return true; } @@ -205,7 +205,7 @@ void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, } template -void Quantize2BitImpl2(mshadow::Stream* s, const std::vector& inputs, +void Quantize2BitImplMShadow(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { // Init threshold and original size // mxnet_op::Kernel::Launch(s, push_pskv_lens.size(), @@ -248,7 +248,7 @@ void Quantize2BitComputeMShadow(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); const TwoBitParam& param = nnvm::get(attrs.parsed); - Quantize2BitImpl2(s, inputs, param.neg_threshold, param.pos_threshold); + Quantize2BitImplMShadow(s, inputs, param.neg_threshold, param.pos_threshold); } template @@ -306,8 +306,8 @@ inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->at(0).Size(), in_attrs->at(1).Size()); int shape = in_attrs->at(0).Size() % 16 == 0 ? 
- in_attrs->at(0).Size() / 16 + 3: - in_attrs->at(0).Size() / 16 + 4; + in_attrs->at(0).Size() / 16 : + in_attrs->at(0).Size() / 16 + 1; CHECK_EQ(in_attrs->at(2).Size(), shape) << "The size of output array is not equal to " << "the size of compressed array"; @@ -467,7 +467,34 @@ void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); - Dequantize2BitImpl(s, inputs, 0.5, 0.5); +// Dequantize2BitImpl(s, inputs, 0.5, 0.5); +int original_size = inputs[1].Size(); + float* out = inputs[1].dptr(); + float* in = inputs[0].dptr(); + for (int compr_block_id=0; compr_block_id(in + compr_block_id); + const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const int negbits[] = {0x80, 0x20, 0x08, 0x02}; + for (int i = out_start_id; (i < out_start_id + 16) && (i < original_size); ++i, ++outval ) { + ch_ptr += !(i & 3); + int col = i & 3; + uint8_t mask = posbits[col]; + uint8_t negmask = negbits[col]; + uint8_t masked = *ch_ptr & mask; + if ( masked == mask ) { + *outval = 0.5; + } // use posbits for mask as posbits are 11 + // compare with negbits + else if ( masked == negmask ) { + *outval = -0.5; + } else { + *outval = 0; + } + } + } + } template diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index a5225103cf7e..1ac8ad5e7403 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -189,7 +189,7 @@ invoking dequantize_2bit(out, array), the 'array' argument will become .set_attr("FInferShape", Dequantize2BitShape) .set_attr("FInferType", Dequantize2BitType) .set_attr("FCompute", Dequantize2BitComputeMShadow) - .set_attr("FGradient", ElemwiseGradUseNone{"_dequantize_2bit"}) + .set_attr("FGradient", ElemwiseGradUseNone{"_dequantize_mshadow_2bit"}) .set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { return std::vector{1}; diff --git a/tests/python/unittest/twobit.py 
b/tests/python/unittest/twobit.py index 67541c6fe37a..ebcf7ff8a997 100644 --- a/tests/python/unittest/twobit.py +++ b/tests/python/unittest/twobit.py @@ -8,8 +8,8 @@ import unittest import timeit -shape = [(268435456)] #(25,),(16,),(1121),(14400),(144000), - +#shape = [(268435456)] #(25,),(16,),(1121),(14400),(144000), +shape = (256000000) # for shape in orig_shape: grad = mx.nd.random_uniform(-0.9,0.9, shape=shape, ctx=default_context()) residual = mx.nd.random_uniform(-0.6,0.6, shape=shape, ctx=default_context()) @@ -22,5 +22,16 @@ def run(): mx.contrib.ndarray.quantize_2bit(grad, res, compr, -0.5, 0.5) mx.contrib.ndarray.dequantize_2bit(compr, decompr) mx.nd.waitall() -d = timeit.repeat(run, repeat=10, number=1) -print(d) \ No newline at end of file +#d = timeit.repeat(run, repeat=10, number=1) +#print(d) + + + +def run_mshadow(): + compr = mx.contrib.nd.create_2bit(grad) + decompr = mx.nd.array(grad.shape) + mx.contrib.ndarray.quantize_mshadow_2bit(grad, res, compr, -0.5, 0.5) + mx.contrib.ndarray.dequantize_mshadow_2bit(compr, decompr) + mx.nd.waitall() +d2 = timeit.repeat(run_mshadow, repeat=10, number=1) +print(d2) diff --git a/twobit2.py b/twobit2.py new file mode 100644 index 000000000000..b78126e6afef --- /dev/null +++ b/twobit2.py @@ -0,0 +1,44 @@ +from __future__ import print_function +import numpy as np +import mxnet as mx +import random +import itertools +from numpy.testing import assert_allclose, assert_array_equal +from mxnet.test_utils import * +import unittest +import timeit +mx.profiler.profiler_set_config(mode='all',filename='profiler.json') +#shape = [(268435456)] #(25,),(16,),(1121),(14400),(144000), +shape = (256000000) +# for shape in orig_shape: +grad = mx.nd.zeros(shape=shape, ctx=default_context()) +residual = mx.nd.zeros(shape=shape, ctx=default_context()) +res = mx.nd.array(residual) +compressed = mx.contrib.nd.create_2bit(grad) +mx.nd.waitall() + +def run(): + compr = mx.nd.zeros(compressed.shape) + #print(compr.asnumpy()) + decompr = 
mx.nd.zeros(grad.shape) + mx.contrib.ndarray.quantize_2bit(grad, res, compr, -0.5, 0.5) + mx.contrib.ndarray.dequantize_2bit(compr, decompr) + mx.nd.waitall() +# print(decompr.asnumpy()) +mx.profiler.profiler_set_state('run') +d = timeit.repeat(run, repeat=10, number=1) +#mx.profiler.profiler_set_state('stop') +#print(d) + + + +def run_mshadow(): + #compr = mx.contrib.nd.create_2bit(grad) + compr = mx.nd.zeros(compressed.shape) + decompr = mx.nd.zeros(grad.shape) + mx.contrib.ndarray.quantize_mshadow_2bit(grad, res, compr, -0.5, 0.5) + mx.contrib.ndarray.dequantize_mshadow_2bit(compr, decompr) + mx.nd.waitall() +d2 = timeit.repeat(run_mshadow, repeat=10, number=1) +mx.profiler.profiler_set_state('stop') +print(d, d2) From b8d2b50b88ff17a8a5fe076fb751047e44172736 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 27 Oct 2017 00:59:16 -0700 Subject: [PATCH 152/237] fix mshadow dequantize --- src/operator/contrib/two_bit_quantize-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index ac69d7263077..289a88495107 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -504,7 +504,7 @@ void Dequantize2BitComputeMShadow(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); - Dequantize2BitImpl(s, inputs, 0.5, 0.5); + Dequantize2BitImplMShadow(s, inputs, 0.5, 0.5); } inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, From e09a8fdaff8fe2324d91183a56931af141649462 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 27 Oct 2017 01:03:08 -0700 Subject: [PATCH 153/237] fix quantize call by kvdist --- src/ndarray/ndarray_function.cc | 4 ++-- src/operator/contrib/two_bit_quantize-inl.h | 2 +- src/operator/contrib/two_bit_quantize.cc | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git 
a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index 74688d3aecbe..e42c523dbcff 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -191,7 +191,7 @@ void ElementwiseSum(mshadow::Stream* s, template<> void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - mxnet::op::Dequantize2BitImpl(s, inputs, neg_threshold, pos_threshold); + mxnet::op::Dequantize2BitImplMShadow(s, inputs, neg_threshold, pos_threshold); } /* @@ -201,7 +201,7 @@ template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, ps::SArray push_pskv_lens, ps::SArray pull_pskv_lens, const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImpl(s, inputs, push_pskv_lens, pull_pskv_lens, neg_threshold, pos_threshold); + mxnet::op::Quantize2BitImplMShadowPskv(s, inputs, push_pskv_lens, pull_pskv_lens, neg_threshold, pos_threshold); } } // namespace ndarray diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 289a88495107..9b19fcef9684 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -170,7 +170,7 @@ struct quantize_2bit { }; template -void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, +void Quantize2BitImplMShadowPskv(mshadow::Stream* s, const std::vector& inputs, ps::SArray push_pskv_lens, ps::SArray pull_pskv_lens, const float neg_threshold, const float pos_threshold) { // Init threshold and original size diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index 1ac8ad5e7403..0554ecd733aa 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -168,6 +168,8 @@ invoking dequantize_2bit(out, array), the 'array' argument will become .add_argument("quantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type 
`float32`") .add_argument("dequantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); + + NNVM_REGISTER_OP(_contrib_dequantize_mshadow_2bit) .describe(R"code(Dequantize an input tensor quantized by quantize_2bit. From b2c9f290333eee01b625567cf83fb7b030b00700 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 27 Oct 2017 01:05:01 -0700 Subject: [PATCH 154/237] making quantize all compatible as operators --- src/ndarray/ndarray_function.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index bd4ccf9d8711..1f2dfe070c22 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -209,7 +209,7 @@ void ElementwiseSum(mshadow::Stream* s, template<> void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - mxnet::op::Dequantize2BitImpl(s, inputs, neg_threshold, pos_threshold); + mxnet::op::Dequantize2BitImplMShadow(s, inputs, neg_threshold, pos_threshold); } /* @@ -219,7 +219,7 @@ template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, ps::SArray push_pskv_lens, ps::SArray pull_pskv_lens, const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImpl(s, inputs, push_pskv_lens, pull_pskv_lens, neg_threshold, pos_threshold); + mxnet::op::Quantize2BitImplMShadowPskv(s, inputs, push_pskv_lens, pull_pskv_lens, neg_threshold, pos_threshold); } } // namespace ndarray From 6e651ed066446a315575e859f4433989bc77bc27 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 27 Oct 2017 08:28:18 +0000 Subject: [PATCH 155/237] add profile to measure.py --- tools/bandwidth/measure.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/bandwidth/measure.py b/tools/bandwidth/measure.py index d9c9fbe930a1..68650ea993a5 100644 --- a/tools/bandwidth/measure.py +++ b/tools/bandwidth/measure.py @@ -30,6 +30,7 @@ logger = 
logging.getLogger() logger.setLevel(logging.INFO) +mx.profiler.profiler_set_config(mode='all',filename='profiler.json') def parse_args(): parser = argparse.ArgumentParser(description="command for benchmark kv-store") @@ -110,6 +111,7 @@ def run(network, optimizer, gpus, kv_store, image_shape, disp_batches, Results = namedtuple('Results', ['iter', 'time', 'bandwidth', 'error']) res = [] + mx.profiler.profiler_set_state('run') for b in range(0, num_batches+1): tic = time.time() for i,g in enumerate(grads): @@ -141,6 +143,7 @@ def run(network, optimizer, gpus, kv_store, image_shape, disp_batches, r.iter, r.time, r.bandwidth, r.error)) res.append(r) toc = 0 + mx.profiler.profiler_set_state('stop') return res if __name__ == "__main__": From 0c48ebbe395000677730801f7f578a0b4e3aa0a8 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 27 Oct 2017 09:40:41 +0000 Subject: [PATCH 156/237] minor profiler changes --- example/image-classification/train_imagenet.py | 2 -- tools/bandwidth/measure.py | 3 --- 2 files changed, 5 deletions(-) diff --git a/example/image-classification/train_imagenet.py b/example/image-classification/train_imagenet.py index f465fbc5f469..3d18fc5cc4fc 100644 --- a/example/image-classification/train_imagenet.py +++ b/example/image-classification/train_imagenet.py @@ -22,7 +22,6 @@ from common import find_mxnet, data, fit from common.util import download_file import mxnet as mx - if __name__ == '__main__': # parse args parser = argparse.ArgumentParser(description="train imagenet-1k", @@ -53,6 +52,5 @@ from importlib import import_module net = import_module('symbols.'+args.network) sym = net.get_symbol(**vars(args)) - # train fit.fit(args, sym, data.get_rec_iter) diff --git a/tools/bandwidth/measure.py b/tools/bandwidth/measure.py index 68650ea993a5..d9c9fbe930a1 100644 --- a/tools/bandwidth/measure.py +++ b/tools/bandwidth/measure.py @@ -30,7 +30,6 @@ logger = logging.getLogger() logger.setLevel(logging.INFO) 
-mx.profiler.profiler_set_config(mode='all',filename='profiler.json') def parse_args(): parser = argparse.ArgumentParser(description="command for benchmark kv-store") @@ -111,7 +110,6 @@ def run(network, optimizer, gpus, kv_store, image_shape, disp_batches, Results = namedtuple('Results', ['iter', 'time', 'bandwidth', 'error']) res = [] - mx.profiler.profiler_set_state('run') for b in range(0, num_batches+1): tic = time.time() for i,g in enumerate(grads): @@ -143,7 +141,6 @@ def run(network, optimizer, gpus, kv_store, image_shape, disp_batches, r.iter, r.time, r.bandwidth, r.error)) res.append(r) toc = 0 - mx.profiler.profiler_set_state('stop') return res if __name__ == "__main__": From fe66ef98119e7284c43fee05bfef05710c3251b9 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 27 Oct 2017 09:59:19 +0000 Subject: [PATCH 157/237] timing print in cpp operator --- src/operator/contrib/two_bit_quantize-inl.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 9b19fcef9684..a93beaffc572 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -23,7 +23,7 @@ */ #ifndef MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ #define MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ - +#include #include #include #include @@ -248,7 +248,11 @@ void Quantize2BitComputeMShadow(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); const TwoBitParam& param = nnvm::get(attrs.parsed); - Quantize2BitImplMShadow(s, inputs, param.neg_threshold, param.pos_threshold); +std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); +Quantize2BitImplMShadow(s, inputs, param.neg_threshold, param.pos_threshold); +std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); +auto dur = 
std::chrono::duration_cast(t2-t1).count(); +std::cout << "quant "< @@ -420,6 +424,9 @@ template void Dequantize2BitImplMShadow(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { // Can only decompress the float32 data + +std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); + int original_size = inputs[1].Size(); mxnet_op::Kernel::Launch(s, original_size/16, // original size original_size, @@ -427,7 +434,9 @@ void Dequantize2BitImplMShadow(mshadow::Stream* s, const std::vector inputs[0].dptr(), // compressed array neg_threshold, // negative threshold pos_threshold); // positive threshold - +std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); +auto dur = std::chrono::duration_cast(t2-t1).count(); +std::cout<<"deq "< void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, From f5204ca8fad889f24cb1f4218465b4c6c5774856 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 27 Oct 2017 03:07:15 -0700 Subject: [PATCH 158/237] time quantize --- src/operator/contrib/two_bit_quantize-inl.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index a93beaffc572..613ff9b90373 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -41,7 +41,6 @@ template int sgn(T val) { return (T(0) < val) - (val < T(0)); } - struct init_mem_2bit { // Initialize output array MSHADOW_XINLINE static void Map(int i, float* out) { @@ -192,6 +191,9 @@ void Quantize2BitImplMShadowPskv(mshadow::Stream* s, const std::vector::Launch(s, inputs[2].Size(), // compressed array inputs[0].Size(), // meta_pos, cumulative_part_indices, @@ -202,6 +204,11 @@ void Quantize2BitImplMShadowPskv(mshadow::Stream* s, const std::vector(), // residual array neg_threshold, // negative threshold 
pos_threshold); // positive threshold + + std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); + auto dur = std::chrono::duration_cast(t2-t1).count(); + std::cout << "quant "< From 5e473b288c65d4103aa24eec67d9786d80f62255 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 27 Oct 2017 20:19:30 +0000 Subject: [PATCH 159/237] saving data feature added --- src/operator/contrib/two.rahul | 340 ++++++++++++++++++++ src/operator/contrib/two_bit_quantize-inl.h | 15 +- twobit2.py | 18 +- 3 files changed, 363 insertions(+), 10 deletions(-) create mode 100644 src/operator/contrib/two.rahul diff --git a/src/operator/contrib/two.rahul b/src/operator/contrib/two.rahul new file mode 100644 index 000000000000..42640fd30af6 --- /dev/null +++ b/src/operator/contrib/two.rahul @@ -0,0 +1,340 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + /*! 
+ * \file two_bit_quantize-inl.h + * \brief implementation of quantize_2bit operation + */ +#ifndef MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ +#define MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ + +#include +#include +#include +#include "../elemwise_op_common.h" +#include "../mshadow_op.h" +#include "../mxnet_op.h" + +namespace mxnet { +namespace op { + +struct init_mem_2bit { + // Initialize output array + MSHADOW_XINLINE static void Map(int i, float* out) { + *(out+i) = 0; + } +}; + +struct TwoBitParam : public dmlc::Parameter { + float pos_threshold, neg_threshold; + DMLC_DECLARE_PARAMETER(TwoBitParam) { + DMLC_DECLARE_FIELD(neg_threshold) + .set_default(-0.1) + .describe("Threshold to quantize negative values. " + "Has to be less than 0"); + DMLC_DECLARE_FIELD(pos_threshold) + .set_default(0.1) + .describe("Threshold to quantize positive values. " + "Has to be greater than 0"); + } +}; + +template +void Create2BitArrayCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + // For now, this method can only compress the float data + mshadow::Stream *s = ctx.get_stream(); + // Init the memory of output to 0x00000000 + mxnet_op::Kernel::Launch(s, outputs[0].Size(), + outputs[0].dptr()); // compressed array +} + +inline bool Create2BitArrayShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // 0. input array + CHECK_EQ(in_attrs->size(), 1U); + // 0. output array + CHECK_EQ(out_attrs->size(), 1U); + // check input + CHECK(!shape_is_none(in_attrs->at(0))); + // output + int shape = in_attrs->at(0).Size() % 16 == 0 ? + in_attrs->at(0).Size() / 16 + 3: + in_attrs->at(0).Size() / 16 + 4; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape{shape}); + return true; +} + +inline bool Create2BitArrayType(const nnvm::NodeAttrs &attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // 0. 
input array + CHECK_EQ(in_attrs->size(), 1U); + // 0. output array + CHECK_EQ(out_attrs->size(), 1U); + // check input + CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) + << "`create_2bit_` only supports float32 input for now"; + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32); + return true; +} + +struct init_threshold_2bit { + MSHADOW_XINLINE static void Map(int i, + float *out, + const float neg_threshold, + const float pos_threshold, + int size) { + // The first two elements in output are thresholds + // The third element is the original size of the array + out[0] = neg_threshold; + out[1] = pos_threshold; + // TODO(huilgolr) check potential problem here? + out[2] = static_cast(size); + } +}; + +struct quantize_2bit { + MSHADOW_XINLINE static void Map(int block_id, + int grad_size, + float *out, + float *grad, + float *residual, + const float neg_threshold, + const float pos_threshold) { + float* compr_block = out + block_id; + // init to 0 + *compr_block = 0; + // start and end are indices in original grad array + int start = block_id*16; + int end = (start+16 <= grad_size) ? 
start+16 : grad_size; + char* block_ptr = reinterpret_cast < char* > (compr_block); + for (int i=start; i < end; i++){ + char* curr_byte = block_ptr + (i-start)/4; + float curr_value = grad[i] + residual[i]; + if (curr_value >= pos_threshold) { + residual[i] = curr_value - pos_threshold; + // set data to 10 + (*curr_byte) |= (2u << (6-((i%4)*2))); + } else if (curr_value <= neg_threshold) { + residual[i] = curr_value - neg_threshold; + // set data to 01 + (*curr_byte) |= (1u << (6-((i%4)*2))); + } else { + // leave data as 00 + residual[i] = curr_value; + } + } + } +}; + +template +void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float neg_threshold, const float pos_threshold) { + // Init threshold and original size + mxnet_op::Kernel::Launch(s, 1, + inputs[2].dptr(), // compressed array + neg_threshold, pos_threshold, + inputs[0].Size()); + // Finally, compress the data and calculate new residual + mxnet_op::Kernel::Launch(s, inputs[2].Size()-3, + inputs[0].Size(), // original grad size + inputs[2].dptr()+3, // compressed array + inputs[0].dptr(), // input array + inputs[1].dptr(), // residual array + neg_threshold, // negative threshold + pos_threshold); // positive threshold +} + +// this function has been defined as quantize_2bit operator +template +void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + mshadow::Stream *s = ctx.get_stream(); + const TwoBitParam& param = nnvm::get(attrs.parsed); + Quantize2BitImpl(s, inputs, param.neg_threshold, param.pos_threshold); +} + +inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // 0. input array + // 1. residual array + // 2. 
compressed array + CHECK_EQ(in_attrs->size(), 3U); + CHECK(!shape_is_none(in_attrs->at(0))); + CHECK(!shape_is_none(in_attrs->at(1))); + CHECK_EQ(in_attrs->at(0).Size(), + in_attrs->at(1).Size()); + int shape = in_attrs->at(0).Size() % 16 == 0 ? + in_attrs->at(0).Size() / 16 + 3: + in_attrs->at(0).Size() / 16 + 4; + CHECK_EQ(in_attrs->at(2).Size(), shape) + << "The size of output array is not equal to " + << "the size of compressed array"; + return true; +} + +inline bool Quantize2BitType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // 0. input array + // 1. residual array + // 2. compressed array + CHECK_EQ(in_attrs->size(), 3U); + // check input + CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) + << "`quantize_2bit_` only supports float32 input for now"; + CHECK_EQ((*in_attrs)[1], mshadow::kFloat32) + << "`quantize_2bit_` only supports float32 input for now"; + CHECK_EQ((*in_attrs)[2], mshadow::kFloat32) + << "`quantize_2bit_` only supports float32 input for now"; + return true; +} + +struct dequantize_2bit { + // Decompress + MSHADOW_XINLINE static void Map(int i, + float *out, + float *in, + float *neg_threshold, + float *pos_threshold) { + // get block ptr + int block_id = i / 16; + char* ch_ptr = reinterpret_cast(in+block_id); + // get row ptr + int row_id = (i%16)/4; + ch_ptr += row_id; + // get column id + int col_id = (i%16)%4; + // Decompress + switch (col_id) { + case 0: + // positve + if (((*ch_ptr) & (0xc0)) == 0x80) { // binary: (10)00 0000 + out[i] = *pos_threshold; + // negative + } else if (((*ch_ptr) & (0xc0)) == 0x40) { // binary: (01)00 0000 + out[i] = *neg_threshold; + } else { // 0 + out[i] = 0; + } + break; + case 1: + // positve + if (((*ch_ptr) & (0x30)) == 0x20) { // binary: 00(10) 0000 + out[i] = *pos_threshold; + // negative + } else if (((*ch_ptr) & (0x30)) == 0x10) { // binary: 00(01) 0000 + out[i] = *neg_threshold; + } else { // 0 + out[i] = 0; + } + break; + case 2: + // positve + if (((*ch_ptr) & 
(0x0c)) == 0x08) { // binary: 00(10) 0000 + out[i] = *pos_threshold; + // negative + } else if (((*ch_ptr) & (0x0c)) == 0x04) { // binary: 00(01) 0000 + out[i] = *neg_threshold; + } else { // 0 + out[i] = 0; + } + break; + case 3: + // positve + if (((*ch_ptr) & (0x03)) == 0x02) { // binary: 00(10) 0000 + out[i] = *pos_threshold; + // negative + } else if (((*ch_ptr) & (0x03)) == 0x01) { // binary: 00(01) 0000 + out[i] = *neg_threshold; + } else { // 0 + out[i] = 0; + } + break; + default: + break; + } + } +}; + +template +void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs) { + // Can only decompress the float32 data + mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size + inputs[1].dptr(), // out array + inputs[0].dptr()+3, // compressed array + inputs[0].dptr(), // negative threshold + inputs[0].dptr()+1); // positive threshold +} + +template +void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + mshadow::Stream *s = ctx.get_stream(); + Dequantize2BitImpl(s, inputs); +} + +inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // 0. compressed array + // 1. original array + CHECK_EQ(in_attrs->size(), 2U); + // No output + CHECK_EQ(out_attrs->size(), 0U); + // check input + CHECK(!shape_is_none(in_attrs->at(0))); + CHECK(!shape_is_none(in_attrs->at(1))); + // TODO(huilgolr) check + CHECK_LE(in_attrs->at(1).Size(), + in_attrs->at(0).Size()*16) + << "The shape of the second input array are " + << "not equal to the original array."; + return true; +} + +inline bool Dequantize2BitType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // 0. compressed array + // 1. 
original array + CHECK_EQ(in_attrs->size(), 2U); + // check input + CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) + << "`dequantize_2bit_` only supports float32 input for now"; + CHECK_EQ((*in_attrs)[1], mshadow::kFloat32) + << "`dequantize_2bit_` only supports float32 input for now"; + return true; +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 613ff9b90373..450d68fd107f 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -33,6 +33,7 @@ #include #include "ps/ps.h" +#include namespace mxnet { namespace op { @@ -207,7 +208,17 @@ void Quantize2BitImplMShadowPskv(mshadow::Stream* s, const std::vector(t2-t1).count(); - std::cout << "quant "<1000) { + NDArray* n = new NDArray(inputs[0], 0); + { + std::unique_ptr fo(dmlc::Stream::Create("quant_data", "w")); + mxnet::NDArray::Save(fo.get(), {*n},{}); + } + + } + } @@ -443,7 +454,7 @@ std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution pos_threshold); // positive threshold std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); auto dur = std::chrono::duration_cast(t2-t1).count(); -std::cout<<"deq "< void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, diff --git a/twobit2.py b/twobit2.py index b78126e6afef..c161ed365cc3 100644 --- a/twobit2.py +++ b/twobit2.py @@ -9,14 +9,16 @@ import timeit mx.profiler.profiler_set_config(mode='all',filename='profiler.json') #shape = [(268435456)] #(25,),(16,),(1121),(14400),(144000), -shape = (256000000) +#shape = (256000000) # for shape in orig_shape: -grad = mx.nd.zeros(shape=shape, ctx=default_context()) -residual = mx.nd.zeros(shape=shape, ctx=default_context()) +grad = mx.nd.load('example/image-classification/quant_data')[0] +print(grad.shape) +#grad = 
mx.nd.random_uniform(-2,2,shape=shape, ctx=default_context()) +residual = mx.nd.random_uniform(-0.4,0.4,shape=grad.shape, ctx=default_context()) res = mx.nd.array(residual) compressed = mx.contrib.nd.create_2bit(grad) -mx.nd.waitall() - +#grad.save('grad') +#residual.save('residual') def run(): compr = mx.nd.zeros(compressed.shape) #print(compr.asnumpy()) @@ -26,7 +28,7 @@ def run(): mx.nd.waitall() # print(decompr.asnumpy()) mx.profiler.profiler_set_state('run') -d = timeit.repeat(run, repeat=10, number=1) +#d = timeit.repeat(run, repeat=10, number=1) #mx.profiler.profiler_set_state('stop') #print(d) @@ -39,6 +41,6 @@ def run_mshadow(): mx.contrib.ndarray.quantize_mshadow_2bit(grad, res, compr, -0.5, 0.5) mx.contrib.ndarray.dequantize_mshadow_2bit(compr, decompr) mx.nd.waitall() -d2 = timeit.repeat(run_mshadow, repeat=10, number=1) +d2 = timeit.repeat(run_mshadow, repeat=25, number=1) mx.profiler.profiler_set_state('stop') -print(d, d2) +print(d2) From 88cc0fdee6d5cd053c843c71f93226ad238df343 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 27 Oct 2017 20:52:31 +0000 Subject: [PATCH 160/237] cleanup test --- twobit2.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/twobit2.py b/twobit2.py index c161ed365cc3..95db15684cb9 100644 --- a/twobit2.py +++ b/twobit2.py @@ -9,33 +9,26 @@ import timeit mx.profiler.profiler_set_config(mode='all',filename='profiler.json') #shape = [(268435456)] #(25,),(16,),(1121),(14400),(144000), -#shape = (256000000) -# for shape in orig_shape: grad = mx.nd.load('example/image-classification/quant_data')[0] print(grad.shape) #grad = mx.nd.random_uniform(-2,2,shape=shape, ctx=default_context()) residual = mx.nd.random_uniform(-0.4,0.4,shape=grad.shape, ctx=default_context()) res = mx.nd.array(residual) compressed = mx.contrib.nd.create_2bit(grad) -#grad.save('grad') -#residual.save('residual') def run(): compr = mx.nd.zeros(compressed.shape) - #print(compr.asnumpy()) decompr = mx.nd.zeros(grad.shape) 
mx.contrib.ndarray.quantize_2bit(grad, res, compr, -0.5, 0.5) mx.contrib.ndarray.dequantize_2bit(compr, decompr) mx.nd.waitall() -# print(decompr.asnumpy()) mx.profiler.profiler_set_state('run') -#d = timeit.repeat(run, repeat=10, number=1) +d = timeit.repeat(run, repeat=10, number=1) #mx.profiler.profiler_set_state('stop') -#print(d) +print(d) def run_mshadow(): - #compr = mx.contrib.nd.create_2bit(grad) compr = mx.nd.zeros(compressed.shape) decompr = mx.nd.zeros(grad.shape) mx.contrib.ndarray.quantize_mshadow_2bit(grad, res, compr, -0.5, 0.5) @@ -43,4 +36,4 @@ def run_mshadow(): mx.nd.waitall() d2 = timeit.repeat(run_mshadow, repeat=25, number=1) mx.profiler.profiler_set_state('stop') -print(d2) +# print( d2) From 5c7a1ff3a8899348a36c972a3a4980417ccfe2f9 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Sat, 28 Oct 2017 01:07:33 +0000 Subject: [PATCH 161/237] small updates --- omp_test.cpp | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++ twobit2.py | 14 ++++++------- 2 files changed, 65 insertions(+), 7 deletions(-) create mode 100644 omp_test.cpp diff --git a/omp_test.cpp b/omp_test.cpp new file mode 100644 index 000000000000..387bd1a6bfb5 --- /dev/null +++ b/omp_test.cpp @@ -0,0 +1,58 @@ +#include +#include +#include "omp.h" +#include + +void quantize_2bit(float* data, float* res, float* compr, long long int size){ + #pragma omp parallel for + for(long long int i=0; i(compr_block); + const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const int negbits[] = {0x80, 0x20, 0x08, 0x02}; + char* curr_byte = block_ptr; + for(int i=s; i= 0.5) { + res[i] -= 0.5; + *curr_byte |= posbits[i&3]; + } + else if(res[i] <= -0.5) { + res[i] += 0.5; + *curr_byte |= negbits[i&3]; + } + } + } +} + +int main() { + std::cout<<"openmp max threads are "<(t2-t1).count(); + std::cout << "time for " <(t2-t1).count(); + std::cout<< "time for quantizing "< Date: Fri, 27 Oct 2017 18:32:49 -0700 Subject: [PATCH 162/237] cleanup --- src/kvstore/kvstore_dist.h | 57 +---- 
src/ndarray/ndarray_function.cc | 3 +- src/ndarray/ndarray_function.cu | 3 +- src/ndarray/ndarray_function.h | 1 - src/operator/contrib/two_bit_quantize-inl.h | 267 +++----------------- src/operator/contrib/two_bit_quantize.cu | 6 + tests/cpp/operator/quantize_perf.cc | 108 ++++++++ tests/python/unittest/test_operator.py | 2 +- tests/python/unittest/twobit.py | 37 --- twobit2.py | 39 --- 10 files changed, 162 insertions(+), 361 deletions(-) create mode 100644 tests/cpp/operator/quantize_perf.cc delete mode 100644 tests/python/unittest/twobit.py delete mode 100644 twobit2.py diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index a15b2b71c721..64050339270a 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -315,12 +315,7 @@ class KVStoreDist : public KVStoreLocal { } if (compress_ == "2bit") { - mu_.lock(); - PSKV& pull_pskv = pull_ps_kv_[key]; - mu_.unlock(); - -// Compress(comm_buf, &small_buf, &res_buf, pskv, priority); - QuantizeAll(comm_buf, &small_buf, &res_buf, pskv.lens, pull_pskv.lens, + QuantizeAll(comm_buf, &small_buf, &res_buf, compress_, neg_threshold_, pos_threshold_, priority); } else { LOG(FATAL) << "Unsupported quantization"; @@ -509,49 +504,7 @@ class KVStoreDist : public KVStoreLocal { PROFILER_MESSAGE("KVStoreDistCompressedPush")); } -// /* -// * \brief Compresses data by dividing original data into a part for each server, then -// * quantizing each of these data blocks. The sizes of these parts come from pskv. 
-// */ -// void Compress(const NDArray& comm_buf, NDArray* small_buf, NDArray* res_buf, -// const PSKV& pskv, int priority) { -// size_t orig_size = comm_buf.shape().Size(); -// // to allow indexing parts for each server -// NDArray flattened_comm_buf = comm_buf.Reshape(TShape{(int64_t) orig_size}); -// -// if (compress_ == "2bit") { -// // should be start of data in original commbuf -// size_t cur_from = 0; -// // should be start of meta in new small_buf -// size_t cur_to = 0; -// for (size_t i = 0; i < pskv.keys.size(); i++) { -// NDArray part_compr = small_buf->Slice(cur_to, cur_to + pskv.lens[i]); -// -// // removing the 3 values from pskv length which are meta data -// // end_part_data represents end of original data for this part -// size_t end_part_data = cur_from + (pskv.lens[i] - 3) * 16; -// // don't exceed original size -// if (end_part_data > orig_size) { -// end_part_data = orig_size; -// } -// NDArray part_data = flattened_comm_buf.Slice(cur_from, end_part_data); -// NDArray part_res = res_buf->Slice(cur_from, end_part_data); -// -// Quantize(part_data, &part_compr, &part_res, compress_, -// neg_threshold_, pos_threshold_, priority); -// -// cur_from = end_part_data; -// cur_to = cur_to + pskv.lens[i]; -// } -// CHECK_EQ(cur_from, orig_size); -// CHECK_EQ(cur_to, small_buf->shape().Size()); -// } else { -// LOG(FATAL) << "Unsupported compression type"; -// } -// } - void QuantizeAll(const NDArray &from, NDArray *to, NDArray *residual, - ps::SArray push_pskv_lens, ps::SArray pull_pskv_lens, const std::string& compress, const float neg_threshold, const float pos_threshold, int priority) { CHECK(from.shape().ndim() != 0) @@ -561,10 +514,9 @@ class KVStoreDist : public KVStoreLocal { int b = to->ctx().dev_mask(); if (a == cpu::kDevMask && b == cpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([from, to, residual, push_pskv_lens, pull_pskv_lens, neg_threshold, pos_threshold](RunContext ctx) { + Engine::Get()->PushSync([from, to, 
residual, neg_threshold, pos_threshold](RunContext ctx) { std::vector inputs = {from.data(), residual->data(), to->data()}; mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, - push_pskv_lens, pull_pskv_lens, neg_threshold, pos_threshold); }, from.ctx(), {from.var()}, {to->var(), residual->var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); @@ -575,10 +527,9 @@ class KVStoreDist : public KVStoreLocal { #if MXNET_USE_CUDA if (a == gpu::kDevMask && b == gpu::kDevMask) { if (compress == "2bit") { - Engine::Get()->PushSync([from, to, residual, pull_pskv_lens, push_pskv_lens, neg_threshold, pos_threshold](RunContext ctx) { + Engine::Get()->PushSync([from, to, residual, neg_threshold, pos_threshold](RunContext ctx) { std::vector inputs = {from.data(), residual->data(), to->data()}; mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, - push_pskv_lens, pull_pskv_lens, neg_threshold, pos_threshold); // Wait GPU kernel to complete ctx.get_stream()->Wait(); @@ -596,8 +547,6 @@ class KVStoreDist : public KVStoreLocal { } } - - PSKV& EncodeKey(int key, size_t size, bool is_push) { if (compress_ != "none") { return EncodeCompressedKey(key, size, is_push); diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index e42c523dbcff..e47acc6c0735 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -199,9 +199,8 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i */ template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, - ps::SArray push_pskv_lens, ps::SArray pull_pskv_lens, const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImplMShadowPskv(s, inputs, push_pskv_lens, pull_pskv_lens, neg_threshold, pos_threshold); + mxnet::op::Quantize2BitImplMShadow(s, inputs, neg_threshold, pos_threshold); } } // namespace ndarray diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index 
1f2dfe070c22..48407312fd2a 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -217,9 +217,8 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i */ template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, - ps::SArray push_pskv_lens, ps::SArray pull_pskv_lens, const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImplMShadowPskv(s, inputs, push_pskv_lens, pull_pskv_lens, neg_threshold, pos_threshold); + mxnet::op::Quantize2BitImplMShadowPskv(s, inputs, neg_threshold, pos_threshold); } } // namespace ndarray diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index 0225f87fba5a..be6eae81cd4a 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -176,7 +176,6 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i */ template void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, - ps::SArray push_pskv_lens, ps::SArray pull_pskv_lens, const float neg_threshold, const float pos_threshold); template diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 450d68fd107f..56e5b0b32e26 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -31,17 +31,12 @@ #include "../mshadow_op.h" #include "../mxnet_op.h" #include +#include #include "ps/ps.h" -#include namespace mxnet { namespace op { -// branchless -template int sgn(T val) { - return (T(0) < val) - (val < T(0)); -} - struct init_mem_2bit { // Initialize output array MSHADOW_XINLINE static void Map(int i, float* out) { @@ -74,6 +69,7 @@ void Create2BitArrayCompute(const nnvm::NodeAttrs& attrs, // Init the memory of output to 0x00000000 mxnet_op::Kernel::Launch(s, outputs[0].Size(), outputs[0].dptr()); // compressed array + } inline bool Create2BitArrayShape(const nnvm::NodeAttrs& attrs, @@ -107,34 +103,8 @@ inline bool 
Create2BitArrayType(const nnvm::NodeAttrs &attrs, return true; } -//struct init_threshold_2bit { -// MSHADOW_XINLINE static void Map(int server_id, -// float *out, -// const float neg_threshold, -// const float pos_threshold, -// ps::SArray compr_sizes, -// ps::SArray orig_sizes) { -// // i for each server -// size_t curr_pos = 0; -// for (int i=0; i(orig_sizes[server_id]); -// } -//}; - struct quantize_2bit { MSHADOW_XINLINE static void Map(int out_block_id, -// std::unordered_set meta_pos, -// std::vector cumulative_part_indices, -// ps::SArray compr_sizes, -// ps::SArray orig_sizes, int original_size, float *out, float *grad, @@ -169,93 +139,29 @@ struct quantize_2bit { } }; -template -void Quantize2BitImplMShadowPskv(mshadow::Stream* s, const std::vector& inputs, - ps::SArray push_pskv_lens, ps::SArray pull_pskv_lens, - const float neg_threshold, const float pos_threshold) { - // Init threshold and original size -// mxnet_op::Kernel::Launch(s, push_pskv_lens.size(), -// inputs[2].dptr(), // compressed array (concat for all servers) -// neg_threshold, pos_threshold, -// push_pskv_lens, pull_pskv_lens); - -// std::unordered_set meta_pos; -// std::vector cumulative_part_indices; -// int cur_pos = 0; -// int cum_index = 0; -// for(int i=0; i::Launch(s, inputs[2].Size(), // compressed array - inputs[0].Size(), -// meta_pos, cumulative_part_indices, -// push_pskv_lens, // compressed sizes -// pull_pskv_lens, // original sizes - inputs[2].dptr(), // compressed array - inputs[0].dptr(), // input array - inputs[1].dptr(), // residual array - neg_threshold, // negative threshold - pos_threshold); // positive threshold - - std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration_cast(t2-t1).count(); - std::cout << "quant for "<1000) { - NDArray* n = new NDArray(inputs[0], 0); - { - std::unique_ptr fo(dmlc::Stream::Create("quant_data", "w")); - mxnet::NDArray::Save(fo.get(), {*n},{}); - } - - } - - -} 
- template void Quantize2BitImplMShadow(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - // Init threshold and original size -// mxnet_op::Kernel::Launch(s, push_pskv_lens.size(), -// inputs[2].dptr(), // compressed array (concat for all servers) -// neg_threshold, pos_threshold, -// push_pskv_lens, pull_pskv_lens); - -// std::unordered_set meta_pos; -// std::vector cumulative_part_indices; -// int cur_pos = 0; -// int cum_index = 0; -// for(int i=0; i::Launch(s, inputs[2].Size(), // compressed array inputs[0].Size(), -// meta_pos, cumulative_part_indices, -// push_pskv_lens, // compressed sizes -// pull_pskv_lens, // original sizes inputs[2].dptr(), // compressed array inputs[0].dptr(), // input array inputs[1].dptr(), // residual array neg_threshold, // negative threshold pos_threshold); // positive threshold -} + std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); + auto dur = std::chrono::duration_cast(t2 - t1).count(); + std::cout << "quantizing " << inputs[0].Size() << " took " << dur << " ms" << std::endl; + if (dur > 1000) { + NDArray *n = new NDArray(inputs[0], 0); + std::unique_ptr fo(dmlc::Stream::Create("quant_data", "w")); + mxnet::NDArray::Save(fo.get(), {*n}, {}); + } +} // this function has been defined as quantize_2bit operator template @@ -266,11 +172,7 @@ void Quantize2BitComputeMShadow(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); const TwoBitParam& param = nnvm::get(attrs.parsed); -std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); -Quantize2BitImplMShadow(s, inputs, param.neg_threshold, param.pos_threshold); -std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); -auto dur = std::chrono::duration_cast(t2-t1).count(); -std::cout << "quant "<(s, inputs, param.neg_threshold, param.pos_threshold); } template @@ 
-353,39 +255,6 @@ inline bool Quantize2BitType(const nnvm::NodeAttrs& attrs, return true; } -struct dequantize_2bit_all { - // Decompress - MSHADOW_XINLINE static void Map(int compr_block_id, - int original_size, - float *out, - float *in, - const float neg_threshold, - const float pos_threshold) { - - int out_start_id = compr_block_id<<4; - float* outval = out + out_start_id; - char* ch_ptr = reinterpret_cast(in + compr_block_id); - const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const int negbits[] = {0x80, 0x20, 0x08, 0x02}; - for (int i = out_start_id; (i < out_start_id + 16) && (i < original_size); ++i, ++outval ) { - ch_ptr += !(i & 3); - int col = i & 3; - uint8_t mask = posbits[col]; - uint8_t negmask = negbits[col]; - uint8_t masked = *ch_ptr & mask; - if ( masked == mask ) { - *outval = pos_threshold; - } // use posbits for mask as posbits are 11 - // compare with negbits - else if ( masked == negmask ) { - *outval = neg_threshold; - } else { - *outval = 0; - } - } - } -}; - struct dequantize_2bit { // Decompress MSHADOW_XINLINE static void Map(int compr_block_id, @@ -416,25 +285,6 @@ struct dequantize_2bit { *outval = 0; } } - - - // get row ptr -// char* ch_ptr = (reinterpret_cast(in + (i >> 4))); -// for (int i=0 ) -// + ((i & 15) >> 2); -// const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; -// const int negbits[] = {0x80, 0x20, 0x08, 0x02}; -// -// int col = (i & 15) & 3; -// if ( ((*ch_ptr) & posbits[col]) == posbits[col] ) { -// out[i] = pos_threshold; -// } // use posbits for mask as posbits are 11 -// // compare with negbits -// else if ( ((*ch_ptr) & posbits[col]) == negbits[col] ) { -// out[i] = neg_threshold; -// } else { -// out[i] = 0; -// } } }; @@ -443,8 +293,7 @@ void Dequantize2BitImplMShadow(mshadow::Stream* s, const std::vector const float neg_threshold, const float pos_threshold) { // Can only decompress the float32 data -std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); - + 
std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); int original_size = inputs[1].Size(); mxnet_op::Kernel::Launch(s, original_size/16, // original size original_size, @@ -452,39 +301,9 @@ std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution inputs[0].dptr(), // compressed array neg_threshold, // negative threshold pos_threshold); // positive threshold -std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); -auto dur = std::chrono::duration_cast(t2-t1).count(); -std::cout<<"deq "< - void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float neg_threshold, const float pos_threshold) { - int original_size = inputs[1].Size(); - float* out = inputs[1].dptr(); - float* in = inputs[0].dptr(); - for (int compr_block_id=0; compr_block_id(in + compr_block_id); - const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const int negbits[] = {0x80, 0x20, 0x08, 0x02}; - for (int i = out_start_id; (i < out_start_id + 16) && (i < original_size); ++i, ++outval ) { - ch_ptr += !(i & 3); - int col = i & 3; - uint8_t mask = posbits[col]; - uint8_t negmask = negbits[col]; - uint8_t masked = *ch_ptr & mask; - if ( masked == mask ) { - *outval = pos_threshold; - } // use posbits for mask as posbits are 11 - // compare with negbits - else if ( masked == negmask ) { - *outval = neg_threshold; - } else { - *outval = 0; - } - } - } + std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); + auto dur = std::chrono::duration_cast(t2-t1).count(); + std::cout<<"dequantizing "< @@ -494,34 +313,32 @@ void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); -// Dequantize2BitImpl(s, inputs, 0.5, 0.5); -int original_size = inputs[1].Size(); - float* out = inputs[1].dptr(); - float* in = inputs[0].dptr(); - for (int compr_block_id=0; 
compr_block_id(in + compr_block_id); - const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const int negbits[] = {0x80, 0x20, 0x08, 0x02}; - for (int i = out_start_id; (i < out_start_id + 16) && (i < original_size); ++i, ++outval ) { - ch_ptr += !(i & 3); - int col = i & 3; - uint8_t mask = posbits[col]; - uint8_t negmask = negbits[col]; - uint8_t masked = *ch_ptr & mask; - if ( masked == mask ) { - *outval = 0.5; - } // use posbits for mask as posbits are 11 - // compare with negbits - else if ( masked == negmask ) { - *outval = -0.5; - } else { - *outval = 0; - } + int original_size = inputs[1].Size(); + float* out = inputs[1].dptr(); + float* in = inputs[0].dptr(); + for (int compr_block_id=0; compr_block_id(in + compr_block_id); + const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const int negbits[] = {0x80, 0x20, 0x08, 0x02}; + for (int i = out_start_id; (i < out_start_id + 16) && (i < original_size); ++i, ++outval ) { + ch_ptr += !(i & 3); + int col = i & 3; + uint8_t mask = posbits[col]; + uint8_t negmask = negbits[col]; + uint8_t masked = *ch_ptr & mask; + if ( masked == mask ) { + *outval = 0.5; + } // use posbits for mask as posbits are 11 + // compare with negbits + else if ( masked == negmask ) { + *outval = -0.5; + } else { + *outval = 0; } } - + } } template diff --git a/src/operator/contrib/two_bit_quantize.cu b/src/operator/contrib/two_bit_quantize.cu index b90ff1840771..6482b16e1b37 100644 --- a/src/operator/contrib/two_bit_quantize.cu +++ b/src/operator/contrib/two_bit_quantize.cu @@ -33,6 +33,12 @@ NNVM_REGISTER_OP(_contrib_quantize_2bit) NNVM_REGISTER_OP(_contrib_dequantize_2bit) .set_attr("FCompute", Dequantize2BitCompute); +NNVM_REGISTER_OP(_contrib_quantize_mshadow_2bit) +.set_attr("FCompute", Quantize2BitComputeMShadow); + +NNVM_REGISTER_OP(_contrib_dequantize_mshadow_2bit) +.set_attr("FCompute", Dequantize2BitComputeMShadow); + NNVM_REGISTER_OP(_contrib_create_2bit) .set_attr("FCompute", Create2BitArrayCompute); diff --git 
a/tests/cpp/operator/quantize_perf.cc b/tests/cpp/operator/quantize_perf.cc new file mode 100644 index 000000000000..d6de53cee5bf --- /dev/null +++ b/tests/cpp/operator/quantize_perf.cc @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#include +#include +#include "../../src/operator/activation-inl.h" +#include "../include/test_op_runner.h" +#include "../include/test_core_op.h" + +using namespace mxnet; + +using kwargs_t = test::op::kwargs_t; + +template +static void RunCoreOpBidirectional(const bool isGPU, + const kwargs_t& op_kwargs, + const char *op_name, + const char *backward_op_name = "") { + const TShape shape({5, 5}); + test::op::CoreOpExecutor op(isGPU, shape); + op.set_verbose(false); + + op.Init(op.ArgsWithOpName(op_kwargs, op_name, backward_op_name)); + + PRINT_NDARRAYS(op.ctx().run_ctx, op.inputs()); + PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs()); + op.Execute(); + PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs()); + if(op.HasBackward()) { + PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_inputs()); + PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs()); + op.ExecuteBackward(); + PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs()); + } +} + + +template +static void RunCoreOpTimingTest(const bool isGPU, + const kwargs_t& op_kwargs, + const char *op_name, + const char *backward_op_name = "") { + const kwargs_t kwargs = test::op::CoreOpExecutor::ArgsWithOpName( + op_kwargs, op_name, backward_op_name); + + // prime code and cache before the performance runs + test::op::CoreOperatorRunner runner; + runner.RunBidirectional(false, {20, 3, 128, 128}, kwargs, 1); + + // Do the performance runs + std::vector shapes; + if (test::performance_run) { + shapes = { + {1, 1, 28, 28}, + {1, 3, 28, 28}, + {50, 1, 18, 32}, + {50, 3, 18, 32}, + {20, 3, 128, 128} + }; + } else { + shapes = { + {1, 1, 28, 28}, + {50, 3, 18, 32}, + }; + } + const char *pu = isGPU ? "GPU" : "CPU"; + for (const TShape &shape : shapes) { + runner.TimingTest(std::string(op_name) + " Operator " + pu, isGPU, false, kwargs, 2, 10, shape); + } +} + +///*! +// * \brief Generic bidirectional sanity test +// */ +//TEST(OMP_TUNING, ExecuteBidirectional) { +// RunCoreOpBidirectional(false, {}, "elemwise_add", "_backward_add"); +//} + +/*! 
+ * \brief ActivationOp timing test for CPU + */ +TEST(OMP_TUNING, TimingCPU) { + RunCoreOpTimingTest(false, {}, "quantize_2bt", COREOP_BWD_OP_NAME_VALUE_NONE); +} + +#if MXNET_USE_CUDA == 1 +/*! + * \brief ActivationOp timing test for GPU + */ +TEST(OMP_TUNING, TimingGPU) { + RunCoreOpTimingTest(true, {}, "elemwise_add", "_backward_add"); +} +#endif // MXNET_USE_CUDA == 1 diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 91826d54efd6..3d5f6d723e8f 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -4311,7 +4311,7 @@ def check(data, idx): def test_two_bit_quantization(): neg_threshold = -0.5 pos_threshold = 0.5 - orig_shape = [(25,),(16,),(1121),(14400)] + orig_shape = [(25,),(16,),(1121),(14400),(144000),(144000)] num_repeat = 1 from struct import pack,unpack diff --git a/tests/python/unittest/twobit.py b/tests/python/unittest/twobit.py deleted file mode 100644 index ebcf7ff8a997..000000000000 --- a/tests/python/unittest/twobit.py +++ /dev/null @@ -1,37 +0,0 @@ -from __future__ import print_function -import numpy as np -import mxnet as mx -import random -import itertools -from numpy.testing import assert_allclose, assert_array_equal -from mxnet.test_utils import * -import unittest -import timeit - -#shape = [(268435456)] #(25,),(16,),(1121),(14400),(144000), -shape = (256000000) -# for shape in orig_shape: -grad = mx.nd.random_uniform(-0.9,0.9, shape=shape, ctx=default_context()) -residual = mx.nd.random_uniform(-0.6,0.6, shape=shape, ctx=default_context()) -res = mx.nd.array(residual) -mx.nd.waitall() - -def run(): - compr = mx.contrib.nd.create_2bit(grad) - decompr = mx.nd.array(grad.shape) - mx.contrib.ndarray.quantize_2bit(grad, res, compr, -0.5, 0.5) - mx.contrib.ndarray.dequantize_2bit(compr, decompr) - mx.nd.waitall() -#d = timeit.repeat(run, repeat=10, number=1) -#print(d) - - - -def run_mshadow(): - compr = mx.contrib.nd.create_2bit(grad) - decompr = 
mx.nd.array(grad.shape) - mx.contrib.ndarray.quantize_mshadow_2bit(grad, res, compr, -0.5, 0.5) - mx.contrib.ndarray.dequantize_mshadow_2bit(compr, decompr) - mx.nd.waitall() -d2 = timeit.repeat(run_mshadow, repeat=10, number=1) -print(d2) diff --git a/twobit2.py b/twobit2.py deleted file mode 100644 index 62d09bc32f0c..000000000000 --- a/twobit2.py +++ /dev/null @@ -1,39 +0,0 @@ -from __future__ import print_function -import numpy as np -import mxnet as mx -import random -import itertools -from numpy.testing import assert_allclose, assert_array_equal -from mxnet.test_utils import * -import unittest -import timeit -#shape = [(268435456)] #(25,),(16,),(1121),(14400),(144000), -grad = mx.nd.load('example/image-classification/quant_data')[0] -print(grad.shape) -#grad = mx.nd.random_uniform(-2,2,shape=shape, ctx=default_context()) -residual = mx.nd.random_uniform(-0.4,0.4,shape=grad.shape, ctx=default_context()) -res = mx.nd.array(residual) -compressed = mx.contrib.nd.create_2bit(grad) - -def run(): - compr = mx.nd.zeros(compressed.shape) - decompr = mx.nd.zeros(grad.shape) - mx.contrib.ndarray.quantize_2bit(grad, res, compr, -0.5, 0.5) - mx.contrib.ndarray.dequantize_2bit(compr, decompr) - mx.nd.waitall() -#priming -run() - -#d = timeit.repeat(run, repeat=10, number=1) -#print(d) - - - -def run_mshadow(): - compr = mx.nd.zeros(compressed.shape) - decompr = mx.nd.zeros(grad.shape) - mx.contrib.ndarray.quantize_mshadow_2bit(grad, res, compr, -0.5, 0.5) - mx.contrib.ndarray.dequantize_mshadow_2bit(compr, decompr) - mx.nd.waitall() -d2 = timeit.repeat(run_mshadow, repeat=25, number=1) -print( d2) From 5294d4dc18974feb7f753e19c456e467d5cc193a Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 27 Oct 2017 18:39:31 -0700 Subject: [PATCH 163/237] minor fix --- src/ndarray/ndarray_function.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index 48407312fd2a..f047660debb5 100644 --- 
a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -218,7 +218,7 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImplMShadowPskv(s, inputs, neg_threshold, pos_threshold); + mxnet::op::Quantize2BitImplMShadow(s, inputs, neg_threshold, pos_threshold); } } // namespace ndarray From 6bb993321507a0cbb800c8e0cea95c0e541efaa8 Mon Sep 17 00:00:00 2001 From: Rahul Date: Tue, 31 Oct 2017 11:25:58 -0700 Subject: [PATCH 164/237] passing additional environment variables through launch.py --- tools/launch.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/launch.py b/tools/launch.py index de42ea2a7dd3..6e6c46f1e5df 100755 --- a/tools/launch.py +++ b/tools/launch.py @@ -64,6 +64,9 @@ def main(): parser.add_argument('--launcher', type=str, default='ssh', choices = ['local', 'ssh', 'mpi', 'sge', 'yarn'], help = 'the launcher to use') + parser.add_argument('--pass-env', type=str, default='', + help = 'given a comma separated list of environment \ + variables, passes their values while launching job') parser.add_argument('command', nargs='+', help = 'command for launching the program') args, unknown = parser.parse_known_args() From 2a7f2f509e47fa50a32489fa43bd6b83750acdb7 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 31 Oct 2017 21:08:44 +0000 Subject: [PATCH 165/237] update local test --- omp_test.cpp | 52 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/omp_test.cpp b/omp_test.cpp index 387bd1a6bfb5..e4657ce3c345 100644 --- a/omp_test.cpp +++ b/omp_test.cpp @@ -4,55 +4,63 @@ #include void quantize_2bit(float* data, float* res, float* compr, long long int size){ - #pragma omp parallel for - for(long long int i=0; i>4; i++) { float* compr_block = compr + i; *compr_block = 0; int s=i<<4, 
e=s+16; char* block_ptr = reinterpret_cast(compr_block); const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; const int negbits[] = {0x80, 0x20, 0x08, 0x02}; - char* curr_byte = block_ptr; + // char* curr_byte = block_ptr; for(int i=s; i= 0.5) { res[i] -= 0.5; - *curr_byte |= posbits[i&3]; + *block_ptr |= posbits[i&3]; } else if(res[i] <= -0.5) { res[i] += 0.5; - *curr_byte |= negbits[i&3]; + *block_ptr |= negbits[i&3]; } } } } + + int main() { std::cout<<"openmp max threads are "<>4;i++){ compr[i] = 0; - } */ + for(int j = i; j(t2-t1).count(); - std::cout << "time for " <(t2-t1).count(); - std::cout<< "time for quantizing "<(t4-t3).count(); + std::cout<< "time for quantizing "< Date: Tue, 31 Oct 2017 21:27:05 +0000 Subject: [PATCH 166/237] update dmlc with pass-env --- dmlc-core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dmlc-core b/dmlc-core index 595d02c0e87b..be05e33f16ef 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 595d02c0e87be8a0846700462b6f45f1b1031e39 +Subproject commit be05e33f16ef5e0be38a410da30c761158263c8f From 7e5301dcfaf625e51b47760b0e18b9f079805ad5 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 31 Oct 2017 22:22:58 +0000 Subject: [PATCH 167/237] fix launch pass env issue --- tools/launch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/launch.py b/tools/launch.py index 6e6c46f1e5df..0372a03bb350 100755 --- a/tools/launch.py +++ b/tools/launch.py @@ -35,7 +35,8 @@ def dmlc_opts(opts): '--num-servers', str(opts.num_servers), '--cluster', opts.launcher, '--host-file', opts.hostfile, - '--sync-dst-dir', opts.sync_dst_dir] + '--sync-dst-dir', opts.sync_dst_dir, + '--pass-env', opts.pass_env] args += opts.command; try: from dmlc_tracker import opts From 594b40cf5034fa3dc3bff10219c860abd0576edc Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 31 Oct 2017 22:28:10 +0000 Subject: [PATCH 168/237] update with pass-env changes --- dmlc-core | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/dmlc-core b/dmlc-core index be05e33f16ef..6b865af73853 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit be05e33f16ef5e0be38a410da30c761158263c8f +Subproject commit 6b865af73853d1d02099c964cfe3e677fcf93525 From 642cfe4025d4f82575492baef7da2d727bfd6af9 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 31 Oct 2017 16:01:50 -0700 Subject: [PATCH 169/237] fix operator increment of block, remove unncessary commented code --- dmlc-core | 2 +- nnvm | 2 +- python/mxnet/kvstore.py | 4 +- src/kvstore/comm.h | 11 +- src/operator/contrib/two_bit_quantize-inl.h | 125 +++----------------- src/operator/contrib/two_bit_quantize.cc | 97 +-------------- 6 files changed, 31 insertions(+), 210 deletions(-) diff --git a/dmlc-core b/dmlc-core index 6b865af73853..595d02c0e87b 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 6b865af73853d1d02099c964cfe3e677fcf93525 +Subproject commit 595d02c0e87be8a0846700462b6f45f1b1031e39 diff --git a/nnvm b/nnvm index 8d79cfd0b42f..c86afa8f17a4 160000 --- a/nnvm +++ b/nnvm @@ -1 +1 @@ -Subproject commit 8d79cfd0b42fbe9f6ad75886d495065d5500b9dd +Subproject commit c86afa8f17a44bcd4e6eec41cd49ba87e4f7a635 diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 608797bea46c..274ad3a1ec18 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -427,13 +427,13 @@ def set_compress(self, compress_params=None): if not isinstance(compress_params['pos_threshold'], numeric_types): raise TypeError('pos_threshold must be a numeric type') else: - compress_params['pos_threshold'] = 0.1 + compress_params['pos_threshold'] = 0.5 if 'neg_threshold' in compress_params: if not isinstance(compress_params['neg_threshold'], numeric_types): raise TypeError('neg_threshold must be a numeric type') else: - compress_params['neg_threshold'] = -0.1 + compress_params['neg_threshold'] = -0.5 if compress_params['pos_threshold'] <= 0 or compress_params['neg_threshold'] >= 0: 
raise ValueError('pos_threshold needs to be greater than 0, \ diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index aa40c59c9125..5a8412fd7bf9 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -561,8 +561,8 @@ class CommDevice : public Comm { if (compress_ == "2bit") { int bits = 16; int64_t small_size = buf.merged.shape().Size() % bits == 0 ? - buf.merged.shape().Size() / bits + 3 : - buf.merged.shape().Size() / bits + 4; + buf.merged.shape().Size() / bits : + buf.merged.shape().Size() / bits + 1; buf.small_recv_buf[i] = NDArray(TShape{small_size}, buf.merged.ctx(), false, buf.merged.dtype()); buf.small_send_buf[i] = NDArray(TShape{small_size}, src[i].ctx(), @@ -578,8 +578,8 @@ class CommDevice : public Comm { // this is done even if the data is on same context as copy_buf because // we don't want the training to be biased towards data on this GPU if (compress_ == "2bit") { -// Quantize(src[i], &(buf.small_send_buf[i]), &(buf.residual[i]), compress_, -// neg_threshold_, pos_threshold_, priority); + Quantize(src[i], &(buf.small_send_buf[i]), &(buf.residual[i]), compress_, + neg_threshold_, pos_threshold_, priority); if (buf.small_send_buf[i].ctx() != buf.small_recv_buf[i].ctx()) { CopyFromTo(buf.small_send_buf[i], &(buf.small_recv_buf[i]), priority); } else { @@ -587,7 +587,8 @@ class CommDevice : public Comm { buf.small_recv_buf[i] = buf.small_send_buf[i]; } // TODO (undo comment) -// Dequantize(buf.small_recv_buf[i], &(buf.copy_buf[i]), compress_, priority); + Dequantize(buf.small_recv_buf[i], &(buf.copy_buf[i]), + neg_threshold_, pos_threshold_, compress_, priority); } else { LOG(FATAL) << "Unsupported type of compression " << compress_; } diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 56e5b0b32e26..ced0fb4e3b45 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -48,11 +48,11 @@ struct TwoBitParam : public dmlc::Parameter { 
float pos_threshold, neg_threshold; DMLC_DECLARE_PARAMETER(TwoBitParam) { DMLC_DECLARE_FIELD(neg_threshold) - .set_default(-0.1) + .set_default(-0.5) .describe("Threshold to quantize negative values. " "Has to be less than 0"); DMLC_DECLARE_FIELD(pos_threshold) - .set_default(0.1) + .set_default(0.5) .describe("Threshold to quantize positive values. " "Has to be greater than 0"); } @@ -118,33 +118,30 @@ struct quantize_2bit { int start = out_block_id << 4; int end = start + 16; // <= original_size) ? start + 16 : original_size; char* block_ptr = reinterpret_cast < char* > (compr_block); - const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const int negbits[] = {0x80, 0x20, 0x08, 0x02}; + const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; - char* curr_byte = block_ptr; for (int i = start; i < end && i < original_size; i++) { // // adds 1 when i-start divisible by 4 - curr_byte += ((i - start) & 3); + block_ptr += !(i & 3); residual[i] += grad[i]; if (residual[i] >= pos_threshold) { residual[i] -= pos_threshold; // set data to 11 - *curr_byte |= posbits[(i & 3)]; + *block_ptr|= posbits[(i & 3)]; } else if (residual[i] <= neg_threshold) { residual[i] -= neg_threshold; // set data to 10 - *curr_byte |= negbits[(i & 3)]; + *block_ptr |= negbits[(i & 3)]; } } } }; template -void Quantize2BitImplMShadow(mshadow::Stream* s, const std::vector& inputs, +void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); -// compress the data and calculate new residual across all mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array inputs[0].Size(), inputs[2].dptr(), // compressed array @@ -152,29 +149,9 @@ void Quantize2BitImplMShadow(mshadow::Stream* s, const std::vector& inputs[1].dptr(), // residual array neg_threshold, // negative threshold 
pos_threshold); // positive threshold - std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration_cast(t2 - t1).count(); - std::cout << "quantizing " << inputs[0].Size() << " took " << dur << " ms" << std::endl; - - if (dur > 1000) { - NDArray *n = new NDArray(inputs[0], 0); - std::unique_ptr fo(dmlc::Stream::Create("quant_data", "w")); - mxnet::NDArray::Save(fo.get(), {*n}, {}); - } } // this function has been defined as quantize_2bit operator -template -void Quantize2BitComputeMShadow(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - mshadow::Stream *s = ctx.get_stream(); - const TwoBitParam& param = nnvm::get(attrs.parsed); - Quantize2BitImplMShadow(s, inputs, param.neg_threshold, param.pos_threshold); -} - template void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -183,39 +160,7 @@ void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); const TwoBitParam& param = nnvm::get(attrs.parsed); - float neg_threshold = param.neg_threshold; - float pos_threshold = param.pos_threshold; - int original_size = inputs[0].Size(); - float *out = inputs[2].dptr(); - float *grad = inputs[0].dptr(); - float *residual = inputs[1].dptr(); - for (int out_block_id=0; out_block_id (compr_block); - const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const int negbits[] = {0x80, 0x20, 0x08, 0x02}; - - char *curr_byte = block_ptr; - for (int i = start; i < end && i < original_size; i++) { - // // adds 1 when i-start divisible by 4 - curr_byte += ((i - start) & 3); - residual[i] += grad[i]; - if (residual[i] >= pos_threshold) { - residual[i] -= pos_threshold; - // set data to 11 - *curr_byte |= posbits[(i & 3)]; - } else if (residual[i] <= neg_threshold) { - residual[i] -= neg_threshold; - // set data to 10 - *curr_byte |= 
negbits[(i & 3)]; - } - } - } + Quantize2BitImpl(s, inputs, param.neg_threshold, param.pos_threshold); } inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, @@ -267,8 +212,8 @@ struct dequantize_2bit { int out_start_id = compr_block_id<<4; float* outval = out + out_start_id; char* ch_ptr = reinterpret_cast(in + compr_block_id); - const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const int negbits[] = {0x80, 0x20, 0x08, 0x02}; + const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; for (int i = out_start_id; (i < out_start_id + 16) && (i < original_size); ++i, ++outval ) { ch_ptr += !(i & 3); int col = i & 3; @@ -289,21 +234,15 @@ struct dequantize_2bit { }; template -void Dequantize2BitImplMShadow(mshadow::Stream* s, const std::vector& inputs, +void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - // Can only decompress the float32 data - - std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); int original_size = inputs[1].Size(); - mxnet_op::Kernel::Launch(s, original_size/16, // original size + mxnet_op::Kernel::Launch(s, original_size>>4, // original size original_size, inputs[1].dptr(), // out array inputs[0].dptr(), // compressed array neg_threshold, // negative threshold pos_threshold); // positive threshold - std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration_cast(t2-t1).count(); - std::cout<<"dequantizing "< @@ -313,42 +252,8 @@ void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, const std::vector& req, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); - int original_size = inputs[1].Size(); - float* out = inputs[1].dptr(); - float* in = inputs[0].dptr(); - for (int compr_block_id=0; compr_block_id(in + compr_block_id); - const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const int 
negbits[] = {0x80, 0x20, 0x08, 0x02}; - for (int i = out_start_id; (i < out_start_id + 16) && (i < original_size); ++i, ++outval ) { - ch_ptr += !(i & 3); - int col = i & 3; - uint8_t mask = posbits[col]; - uint8_t negmask = negbits[col]; - uint8_t masked = *ch_ptr & mask; - if ( masked == mask ) { - *outval = 0.5; - } // use posbits for mask as posbits are 11 - // compare with negbits - else if ( masked == negmask ) { - *outval = -0.5; - } else { - *outval = 0; - } - } - } -} - -template -void Dequantize2BitComputeMShadow(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - mshadow::Stream *s = ctx.get_stream(); - Dequantize2BitImplMShadow(s, inputs, 0.5, 0.5); + const TwoBitParam& param = nnvm::get(attrs.parsed); + Dequantize2BitImpl(s, inputs, param.neg_threshold, param.pos_threshold); } inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index 0554ecd733aa..b2086d88cf98 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -56,10 +56,8 @@ quantized into 2-bits '10'. The other elements will be quantized as '00'. Every 16 floats in the original array will be packed into one float variable in the output array. -In this example, 'out' has 4 elements. The first element stores the -neg_threshold (-4.0), the second element stores the pos_threshold (+4.0), the -third element stores the original size of the uncompressed array, and the -original array will be quantized into a single element in the last element. +In this example, 'out' has 1 element, which represents upto16 elements of +original array quantized into a single element. The residual is also updated to [1.0, -3.0, -1.0, -3.0]. )code" ADD_FILELINE) .set_num_inputs(3) @@ -78,54 +76,6 @@ The residual is also updated to [1.0, -3.0, -1.0, -3.0]. 
.add_argument("quantized_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") .add_arguments(TwoBitParam::__FIELDS__()); - NNVM_REGISTER_OP(_contrib_quantize_mshadow_2bit) - .describe(R"code(Quantize an input tensor into using 2bits for each value using -user-specified thresholds, while storing quantization error in residual array. - -The quantize_2bit operator takes 5 arguments and is called as follows: -`quantize_2bit(array, residual, out, neg_threshold, pos_threshold)`. -The operator modifies `residual` and `out` arrays. -The `out`variable will be the quantized array. Note that, `out` array can be generated by -invoking `create_2bit(array)`, avoiding calculation of size of quantized array. -This `out` array has first three elements as negative threshold, positive threshold, -and size of the original uncompressed array. Any elements after these three elements -represent quantized data. -The operation sums up array and residual, and then -applies the thresholds to quantize the data into one of three states -represented by 2bits. 16 such quantized floats in the original array -are packed together into one float in the `out` array. -The quantization error is stored in residual array. - -For example, assume the input array (gradient) is [5.0, -1.0, -5.0, -4.0], and the -residual is [0.0, -2.0, 0, 1.0]. Let the negative and positive thresholds be --4.0 and +4.0, respectively. In this method, the elements whose -(gradient + residual) >= pos_threshold will be quantized into 2-bits '01', -and the elements whose (gradient + residual) <= neg_threshold will be -quantized into 2-bits '10'. The other elements will be quantized -as '00'. Every 16 floats in the original array will be packed -into one float variable in the output array. - -In this example, 'out' has 4 elements. 
The first element stores the -neg_threshold (-4.0), the second element stores the pos_threshold (+4.0), the -third element stores the original size of the uncompressed array, and the -original array will be quantized into a single element in the last element. -The residual is also updated to [1.0, -3.0, -1.0, -3.0]. -)code" ADD_FILELINE) - .set_num_inputs(3) - .set_num_outputs(0) - .set_attr_parser(ParamParser) - .set_attr("FInferShape", Quantize2BitShape) - .set_attr("FInferType", Quantize2BitType) - .set_attr("FCompute", Quantize2BitComputeMShadow) - .set_attr("FGradient", ElemwiseGradUseNone{"_quantize_2bit"}) - .set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - return std::vector{1, 2}; - }) - .add_argument("gradient_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") - .add_argument("residual_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") - .add_argument("quantized_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") - .add_arguments(TwoBitParam::__FIELDS__()); NNVM_REGISTER_OP(_contrib_create_2bit) .describe(R"code(Generate an array with the right shape to store the input data after @@ -143,20 +93,14 @@ NNVM_REGISTER_OP(_contrib_dequantize_2bit) .describe(R"code(Dequantize an input tensor quantized by quantize_2bit. The dequantize_2bit operator takes two input arguments. The first input is a NDArray, -which has been generated by quantize_2bit(). This operator expects the first -three elements to be the negative threshold, positive threshold, and the size -of the original uncompressed array. Starting from the fourth element are expected to -be quantized values of the original array. +which has been generated by quantize_2bit(). The second input is a NDArray that has the same shape as the original array before quantizing. The operator replaces the contents of this array with dequantized data. 
- -In the example was described for quantize_2bit, -invoking dequantize_2bit(out, array), the 'array' argument will become -[4.0, 0, -4.0, 0], where -4.0 and 4.0 are the negative and positive thresholds. )code" ADD_FILELINE) .set_num_inputs(2) .set_num_outputs(0) +.set_attr_parser(ParamParser) .set_attr("FInferShape", Dequantize2BitShape) .set_attr("FInferType", Dequantize2BitType) .set_attr("FCompute", Dequantize2BitCompute) @@ -166,37 +110,8 @@ invoking dequantize_2bit(out, array), the 'array' argument will become return std::vector{1}; }) .add_argument("quantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") -.add_argument("dequantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); - - - -NNVM_REGISTER_OP(_contrib_dequantize_mshadow_2bit) -.describe(R"code(Dequantize an input tensor quantized by quantize_2bit. - -The dequantize_2bit operator takes two input arguments. The first input is a NDArray, -which has been generated by quantize_2bit(). This operator expects the first -three elements to be the negative threshold, positive threshold, and the size -of the original uncompressed array. Starting from the fourth element are expected to -be quantized values of the original array. -The second input is a NDArray that has the same shape as the original -array before quantizing. The operator replaces the contents of this array -with dequantized data. +.add_argument("dequantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") +.add_arguments(TwoBitParam::__FIELDS__()); -In the example was described for quantize_2bit, -invoking dequantize_2bit(out, array), the 'array' argument will become -[4.0, 0, -4.0, 0], where -4.0 and 4.0 are the negative and positive thresholds. 
-)code" ADD_FILELINE) - .set_num_inputs(2) - .set_num_outputs(0) - .set_attr("FInferShape", Dequantize2BitShape) - .set_attr("FInferType", Dequantize2BitType) - .set_attr("FCompute", Dequantize2BitComputeMShadow) - .set_attr("FGradient", ElemwiseGradUseNone{"_dequantize_mshadow_2bit"}) - .set_attr("FMutateInputs", - [](const nnvm::NodeAttrs& attrs) { - return std::vector{1}; - }) - .add_argument("quantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") - .add_argument("dequantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); } // namespace op } // namespace mxnet From 3c8686ad0480abecf00ec4aa99cf1f7bddb144b3 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 31 Oct 2017 16:03:56 -0700 Subject: [PATCH 170/237] fix operator increment of block, remove unncessary commented code --- src/ndarray/ndarray_function.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index e47acc6c0735..6da0f58a7259 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -191,7 +191,7 @@ void ElementwiseSum(mshadow::Stream* s, template<> void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - mxnet::op::Dequantize2BitImplMShadow(s, inputs, neg_threshold, pos_threshold); + mxnet::op::Dequantize2BitImpl(s, inputs, neg_threshold, pos_threshold); } /* @@ -200,7 +200,7 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImplMShadow(s, inputs, neg_threshold, pos_threshold); + mxnet::op::Quantize2BitImpl(s, inputs, neg_threshold, pos_threshold); } } // namespace ndarray From 483d61019ec719c50f25de4da62182f673593264 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 31 Oct 2017 
16:05:11 -0700 Subject: [PATCH 171/237] fix operator increment of block, remove unncessary commented code --- src/operator/contrib/two_bit_quantize.cu | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize.cu b/src/operator/contrib/two_bit_quantize.cu index 6482b16e1b37..b90ff1840771 100644 --- a/src/operator/contrib/two_bit_quantize.cu +++ b/src/operator/contrib/two_bit_quantize.cu @@ -33,12 +33,6 @@ NNVM_REGISTER_OP(_contrib_quantize_2bit) NNVM_REGISTER_OP(_contrib_dequantize_2bit) .set_attr("FCompute", Dequantize2BitCompute); -NNVM_REGISTER_OP(_contrib_quantize_mshadow_2bit) -.set_attr("FCompute", Quantize2BitComputeMShadow); - -NNVM_REGISTER_OP(_contrib_dequantize_mshadow_2bit) -.set_attr("FCompute", Dequantize2BitComputeMShadow); - NNVM_REGISTER_OP(_contrib_create_2bit) .set_attr("FCompute", Create2BitArrayCompute); From bc245b4885638412d528403f815094cc3cac94ae Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 31 Oct 2017 16:06:21 -0700 Subject: [PATCH 172/237] fix operator increment of block, remove unncessary commented code --- src/ndarray/ndarray_function.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index f047660debb5..d5b519a836d9 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -209,7 +209,7 @@ void ElementwiseSum(mshadow::Stream* s, template<> void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - mxnet::op::Dequantize2BitImplMShadow(s, inputs, neg_threshold, pos_threshold); + mxnet::op::Dequantize2BitImpl(s, inputs, neg_threshold, pos_threshold); } /* @@ -218,7 +218,7 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - 
mxnet::op::Quantize2BitImplMShadow(s, inputs, neg_threshold, pos_threshold); + mxnet::op::Quantize2BitImpl(s, inputs, neg_threshold, pos_threshold); } } // namespace ndarray From 2f257e50c3cdde1bc937e08c0b2f9da2fa15fda7 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 31 Oct 2017 16:06:21 -0700 Subject: [PATCH 173/237] bring back quantize Signed-off-by: Rahul --- src/kvstore/kvstore_dist.h | 45 +--------------- src/ndarray/ndarray.cc | 94 +++++++++++++++------------------ src/ndarray/ndarray_function.cu | 4 +- 3 files changed, 47 insertions(+), 96 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 64050339270a..3f5bdd593f45 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -315,7 +315,7 @@ class KVStoreDist : public KVStoreLocal { } if (compress_ == "2bit") { - QuantizeAll(comm_buf, &small_buf, &res_buf, + Quantize(comm_buf, &small_buf, &res_buf, compress_, neg_threshold_, pos_threshold_, priority); } else { LOG(FATAL) << "Unsupported quantization"; @@ -504,48 +504,7 @@ class KVStoreDist : public KVStoreLocal { PROFILER_MESSAGE("KVStoreDistCompressedPush")); } - void QuantizeAll(const NDArray &from, NDArray *to, NDArray *residual, - const std::string& compress, const float neg_threshold, const float pos_threshold, - int priority) { - CHECK(from.shape().ndim() != 0) - << "source operands have zero dimension shape"; - // important: callback must always capture by value - int a = from.ctx().dev_mask(); - int b = to->ctx().dev_mask(); - if (a == cpu::kDevMask && b == cpu::kDevMask) { - if (compress == "2bit") { - Engine::Get()->PushSync([from, to, residual, neg_threshold, pos_threshold](RunContext ctx) { - std::vector inputs = {from.data(), residual->data(), to->data()}; - mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, - neg_threshold, pos_threshold); - }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); - } 
else { - LOG(FATAL) << "Unsupported Quantization"; - } - } else { -#if MXNET_USE_CUDA - if (a == gpu::kDevMask && b == gpu::kDevMask) { - if (compress == "2bit") { - Engine::Get()->PushSync([from, to, residual, neg_threshold, pos_threshold](RunContext ctx) { - std::vector inputs = {from.data(), residual->data(), to->data()}; - mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, - neg_threshold, pos_threshold); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); - } else { - LOG(FATAL) << "Unsupported Quantization"; - } - } else { - LOG(FATAL) << "unknown device mask"; - } -#else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; -#endif - } - } + PSKV& EncodeKey(int key, size_t size, bool is_push) { if (compress_ != "none") { diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 3a8e295a89b6..16d4bc639820 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -557,57 +557,49 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) { #endif } } -// -//void Quantize(const NDArray &from, NDArray *to, NDArray *residual, const std::string& compress, -// const float neg_threshold, const float pos_threshold, -// int priority) { -// CHECK(from.shape().ndim() != 0) -// << "source operands have zero dimension shape"; -// // important: callback must always capture by value -// NDArray ret = *to; -// NDArray res = *residual; -// int a = from.ctx().dev_mask(); -// int b = to->ctx().dev_mask(); -// if (a == cpu::kDevMask && b == cpu::kDevMask) { -// if (compress == "2bit") { -// Engine::Get()->PushSync([from, res, ret, neg_threshold, pos_threshold](RunContext ctx) { -// std::vector inputs(3); -// inputs[0] = from.data(); -// inputs[1] = res.data(); -// inputs[2] = ret.data(); -// mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, -// neg_threshold, pos_threshold); -// }, from.ctx(), 
{from.var()}, {ret.var(), res.var()}, -// FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); -// } else { -// LOG(FATAL) << "Unsupported Quantization"; -// } -// } else { -//#if MXNET_USE_CUDA -// if (a == gpu::kDevMask && b == gpu::kDevMask) { -// if (compress == "2bit") { -// Engine::Get()->PushSync([from, res, ret, neg_threshold, pos_threshold](RunContext ctx) { -// std::vector inputs(3); -// inputs[0] = from.data(); -// inputs[1] = res.data(); -// inputs[2] = ret.data(); -// mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, -// neg_threshold, pos_threshold); -// // Wait GPU kernel to complete -// ctx.get_stream()->Wait(); -// }, from.ctx(), {from.var()}, {ret.var(), res.var()}, -// FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); -// } else { -// LOG(FATAL) << "Unsupported Quantization"; -// } -// } else { -// LOG(FATAL) << "unknown device mask"; -// } -//#else -// LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; -//#endif -// } -// } + +void Quantize(const NDArray &from, NDArray *to, NDArray *residual, + const std::string& compress, const float neg_threshold, const float pos_threshold, + int priority) { + CHECK(from.shape().ndim() != 0) + << "source operands have zero dimension shape"; + // important: callback must always capture by value + int a = from.ctx().dev_mask(); + int b = to->ctx().dev_mask(); + if (a == cpu::kDevMask && b == cpu::kDevMask) { + if (compress == "2bit") { + Engine::Get()->PushSync([from, to, residual, neg_threshold, pos_threshold](RunContext ctx) { + std::vector inputs = {from.data(), residual->data(), to->data()}; + mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, + neg_threshold, pos_threshold); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); + } else { + LOG(FATAL) << "Unsupported Quantization"; + } + } else { +#if MXNET_USE_CUDA + if (a == gpu::kDevMask && b == gpu::kDevMask) { + if (compress == "2bit") 
{ + Engine::Get()->PushSync([from, to, residual, neg_threshold, pos_threshold](RunContext ctx) { + std::vector inputs = {from.data(), residual->data(), to->data()}; + mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, + neg_threshold, pos_threshold); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); + } else { + LOG(FATAL) << "Unsupported Quantization"; + } + } else { + LOG(FATAL) << "unknown device mask"; + } +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } +} void Dequantize(const NDArray &from, NDArray *to, const float neg_threshold, const float pos_threshold, const std::string& compress, int priority) { diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index f047660debb5..d5b519a836d9 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -209,7 +209,7 @@ void ElementwiseSum(mshadow::Stream* s, template<> void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - mxnet::op::Dequantize2BitImplMShadow(s, inputs, neg_threshold, pos_threshold); + mxnet::op::Dequantize2BitImpl(s, inputs, neg_threshold, pos_threshold); } /* @@ -218,7 +218,7 @@ void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& i template<> void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImplMShadow(s, inputs, neg_threshold, pos_threshold); + mxnet::op::Quantize2BitImpl(s, inputs, neg_threshold, pos_threshold); } } // namespace ndarray From 46cbf5cd935722b521a56f6d5903b96c0f53da1f Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 31 Oct 2017 16:20:10 -0700 Subject: [PATCH 174/237] fix test --- tests/python/unittest/test_operator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index fb3ca8290a69..c2006fed9838 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -4351,7 +4351,7 @@ def compute_expected(arr, neg, pos, curr_residual): if len(str_quant)%16 != 0: str_quant += '0'*(16 - len(str_quant)%16) - compr = [neg, pos, len(arr)] + compr = [] # converts the string generated into integers 32chars at a time i = 0 while i Date: Tue, 31 Oct 2017 17:32:02 -0700 Subject: [PATCH 175/237] fix bug with increment of char pointer --- src/operator/contrib/two_bit_quantize-inl.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index ced0fb4e3b45..8c0e8b5f662e 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -33,6 +33,7 @@ #include #include #include "ps/ps.h" +#include namespace mxnet { namespace op { @@ -123,16 +124,16 @@ struct quantize_2bit { for (int i = start; i < end && i < original_size; i++) { // // adds 1 when i-start divisible by 4 - block_ptr += !(i & 3); + char* curr_byte = block_ptr + ((i-start)>>2); residual[i] += grad[i]; if (residual[i] >= pos_threshold) { residual[i] -= pos_threshold; // set data to 11 - *block_ptr|= posbits[(i & 3)]; + *curr_byte |= posbits[(i & 3)]; } else if (residual[i] <= neg_threshold) { residual[i] -= neg_threshold; // set data to 10 - *block_ptr |= negbits[(i & 3)]; + *curr_byte |= negbits[(i & 3)]; } } } @@ -215,11 +216,11 @@ struct dequantize_2bit { const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; for (int i = out_start_id; (i < out_start_id + 16) && (i < original_size); ++i, ++outval ) { - ch_ptr += !(i & 3); + char* curr_byte = ch_ptr + ((i-out_start_id)>>2); int col = i & 3; uint8_t mask = posbits[col]; uint8_t negmask = 
negbits[col]; - uint8_t masked = *ch_ptr & mask; + uint8_t masked = *curr_byte & mask; if ( masked == mask ) { *outval = pos_threshold; } // use posbits for mask as posbits are 11 From c84af06dc7eb210381f7c525a2b25ebc69d6f23c Mon Sep 17 00:00:00 2001 From: Rahul Date: Wed, 1 Nov 2017 16:12:57 -0700 Subject: [PATCH 176/237] fix bug with increment of char pointer --- example/image-classification/common/fit.py | 9 ++-- src/kvstore/kvstore_dist.h | 49 +++++++++++++++++++-- src/kvstore/kvstore_dist_server.h | 23 ++++++++++ src/operator/contrib/two_bit_quantize-inl.h | 49 +++++++++++---------- tests/nightly/dist_sync_kvstore.py | 2 +- 5 files changed, 102 insertions(+), 30 deletions(-) diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index b82a147c88d9..c863c16a0c92 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -103,8 +103,11 @@ def add_fit_args(parser): help='1 means test reading speed without training') train.add_argument('--dtype', type=str, default='float32', help='precision: float32 or float16') - train.add_argument('--compress', type=str, default='none') - train.add_argument('--threshold', type=float, default=0.5) + train.add_argument('--gc-type', type=str, default='none', + help='type of gradient compression to use, \ + takes `2bit` or `none` for now') + train.add_argument('--gc-threshold', type=float, default=0.5, + help='threshold for 2bit gradient compression') return train def fit(args, network, data_loader, **kwargs): @@ -116,7 +119,7 @@ def fit(args, network, data_loader, **kwargs): """ # kvstore kv = mx.kvstore.create(args.kv_store) - kv.set_compress({'compress':args.compress, 'pos_threshold':args.threshold, 'neg_threshold':-1*args.threshold}) + kv.set_compress({'compress':args.gc_type, 'pos_threshold':args.gc_threshold, 'neg_threshold':-1*args.gc_threshold}) # logging head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' 
logging.basicConfig(level=logging.DEBUG, format=head) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 3f5bdd593f45..ad45702f4297 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -272,12 +272,47 @@ class KVStoreDist : public KVStoreLocal { for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devices int key = uniq_keys[i]; + if(count_save.count(key)==0) { + count_save[key] = 0; + } else { + count_save[key]++; + } + const auto& vals = grouped_vals[i]; + if(count_save[key]<3 && get_rank()==0){ + {std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals0_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); + vals[0].WaitToRead(); + mxnet::NDArray::Save(fo.get(), {vals[0]},{}); + } + if(vals.size()>=2) { + std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals1_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); + vals[1].WaitToRead(); + mxnet::NDArray::Save(fo.get(), {vals[1]},{}); + } + + if(vals.size()>=3) { + std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals2_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); + vals[2].WaitToRead(); + mxnet::NDArray::Save(fo.get(), {vals[2]},{}); + } + if(vals.size()>=4) {std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals3_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); + vals[3].WaitToRead(); + mxnet::NDArray::Save(fo.get(), {vals[3]},{}); + } + } + NDArray merged = do_merge ? 
comm_->Reduce(key, vals, priority) : vals[0]; const auto storage_type = merged.storage_type(); auto &comm_buf = comm_buf_[key]; if (merged.ctx().dev_mask() == cpu::kDevMask) { + + if(count_save[key]<3 && get_rank()==0) { + std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_merged_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); + merged.WaitToRead(); + mxnet::NDArray::Save(fo.get(), {merged},{}); + } + // Start of a push doesn't guarantee that the previous pushes are completed. // This shouldn't affect training of networks though because training involves // a sequence of push, pull, then push. This imposes ordering that the @@ -293,6 +328,12 @@ class KVStoreDist : public KVStoreLocal { } CopyFromTo(merged, &comm_buf); } + + if(count_save[key]<3 && get_rank()==0) { + std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_data_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); + comm_buf.WaitToRead(); + mxnet::NDArray::Save(fo.get(), {comm_buf},{}); + } if (compress_ != "none") { auto &small_buf = compr_buf_[key]; @@ -314,9 +355,9 @@ class KVStoreDist : public KVStoreLocal { } } - if (compress_ == "2bit") { - Quantize(comm_buf, &small_buf, &res_buf, - compress_, neg_threshold_, pos_threshold_, priority); + if (compress_ == "2bit") { + Quantize(comm_buf, &small_buf, &res_buf, + compress_, neg_threshold_, pos_threshold_, priority);//neg_threshold_, pos_threshold_, priority); } else { LOG(FATAL) << "Unsupported quantization"; } @@ -746,6 +787,8 @@ class KVStoreDist : public KVStoreLocal { std::unordered_map residual_; bool log_verbose_; + + std::unordered_map count_save; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 47e241e1a542..8ed55d7f5ed9 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -381,6 +381,12 @@ class KVStoreDistServer { int key = DecodeKey(req_data.keys[1]); 
auto& stored = store_[key]; + if (count_save.count(key)==0) { + count_save[key] = 0; + } else { + count_save[key]++; + } + size_t ds[] = {(size_t)req_data.lens[1]}; TShape dshape(ds, ds + 1); TBlob recv_blob((real_t*)req_data.vals.data(), // NOLINT(*) @@ -398,9 +404,25 @@ class KVStoreDistServer { if (stored.is_none()) { // initialization stored = NDArray(dshape, Context()); + { + std::unique_ptr fo( + dmlc::Stream::Create((compress_ + "server_recved_count" + std::to_string(count_save[key]) + "_" + std::to_string(key)).c_str(), "w")); + recved.WaitToRead(); + mxnet::NDArray::Save(fo.get(), {recved}, {}); + } + Dequantize(recved, &stored, neg_threshold, pos_threshold, compress_, 0); + server->Response(req_meta); stored.WaitToRead(); + + + { + std::unique_ptr fo( + dmlc::Stream::Create((compress_ + "server_stored_count" + std::to_string(count_save[key]) + "_" + std::to_string(key)).c_str(), "w")); + stored.WaitToRead(); + mxnet::NDArray::Save(fo.get(), {stored}, {}); + } } else if (sync_mode_) { // synced push auto& merged = merge_buf_[key]; @@ -541,6 +563,7 @@ class KVStoreDistServer { float pos_threshold = 0.5; float neg_threshold = -0.5; + std::unordered_map count_save; }; } // namespace kvstore diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 8c0e8b5f662e..48e345a63848 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -130,10 +130,14 @@ struct quantize_2bit { residual[i] -= pos_threshold; // set data to 11 *curr_byte |= posbits[(i & 3)]; +// std::cout<<"pos "<< std::to_string(i&3) << " " << std::bitset<8>(*curr_byte)<(*curr_byte)<(*curr_byte)<(in + compr_block_id); + float* outval = out + i; + char* ch_ptr = reinterpret_cast(in + (i>>4)); + +// std::cout<(*ch_ptr)<<" " <(*(ch_ptr+1))<<" "<(*(ch_ptr+2))<<" "<(*(ch_ptr+3))<> 2 ); const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; - for (int i = 
out_start_id; (i < out_start_id + 16) && (i < original_size); ++i, ++outval ) { - char* curr_byte = ch_ptr + ((i-out_start_id)>>2); - int col = i & 3; - uint8_t mask = posbits[col]; - uint8_t negmask = negbits[col]; - uint8_t masked = *curr_byte & mask; - if ( masked == mask ) { - *outval = pos_threshold; - } // use posbits for mask as posbits are 11 - // compare with negbits - else if ( masked == negmask ) { - *outval = neg_threshold; - } else { - *outval = 0; - } + int col = i & 3; + uint8_t mask = posbits[col]; + uint8_t negmask = negbits[col]; + uint8_t masked = *ch_ptr & mask; + if ( masked == mask ) { + *outval = pos_threshold; +// std::cout<(*ch_ptr)<< " "<(masked)<< " "<(*ch_ptr)<< " "<(masked)<< " "<(*ch_ptr)<< " "<(masked)<< " 0"< void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float neg_threshold, const float pos_threshold) { - int original_size = inputs[1].Size(); - mxnet_op::Kernel::Launch(s, original_size>>4, // original size - original_size, + mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size inputs[1].dptr(), // out array inputs[0].dptr(), // compressed array neg_threshold, // negative threshold diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 99cc38bdda1c..a374db319bf4 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -244,7 +244,7 @@ def check_compr_random(kv, pos, neg, nworker): # calculate expected value after pull mx.random.seed(123) rnd.seed(123) - for k,s in [('112221',irregular_shape),('11221', big_shape), ('1121', shape)]: + for k,s in [('1121', shape)]:#,('112221',irregular_shape),('11221', big_shape)]: orig_val = mx.nd.zeros(s) kv.pull(k, orig_val) From d316700481efe5f828a0d4e39e7644eff82eab5d Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 1 Nov 2017 23:15:02 +0000 Subject: [PATCH 177/237] debug module --- python/mxnet/model.py | 8 +++++++- python/mxnet/module/base_module.py | 3 ++- 2 files changed, 9 
insertions(+), 2 deletions(-) diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 2444ca0dc59e..3a0ab5416d78 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -98,9 +98,15 @@ def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names, update_o for idx, param_on_devs in enumerate(param_arrays): name = param_names[idx] kvstore.init(name, arg_params[name]) - + fla = 0 + if arg_params[name].sum()[0] != 0: + fla=1 + print(name, arg_params[name]) if update_on_kvstore: kvstore.pull(name, param_on_devs, priority=-idx) + nd.waitall() + if fla==1: + print('pulled ',param_on_devs.sum(), param_on_devs) def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names): """Perform update of param_arrays from grad_arrays on kvstore.""" diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py index bae166e3ffd8..59005a423c6b 100644 --- a/python/mxnet/module/base_module.py +++ b/python/mxnet/module/base_module.py @@ -469,7 +469,8 @@ def fit(self, train_data, eval_data=None, eval_metric='acc', validation_metric = eval_metric if not isinstance(eval_metric, metric.EvalMetric): eval_metric = metric.create(eval_metric) - + ndarray.waitall() + return ################################################################################ # training loop ################################################################################ From e8aa9b5e8c3e103fffaa976d7322218bd8c18d43 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 1 Nov 2017 16:44:18 -0700 Subject: [PATCH 178/237] update test --- tests/python/unittest/test_operator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index c2006fed9838..a73497030c33 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -4313,7 +4313,7 @@ def test_two_bit_quantization(): neg_threshold = -0.5 pos_threshold = 0.5 
orig_shape = [(25,),(16,),(1121),(14400),(144000),(144000)] - num_repeat = 1 + num_repeat = 10 from struct import pack,unpack def bits2int(bits): @@ -4336,11 +4336,11 @@ def compute_expected(arr, neg, pos, curr_residual): for i, a in np.ndenumerate(arr_npy): a += curr_res_npy[i] if a >= pos: - str_quant += '10' + str_quant += '11' new_residual.append(a - pos) decompr.append(pos) elif a <= neg: - str_quant += '01' + str_quant += '10' new_residual.append(a - neg) decompr.append(neg) else: @@ -4366,8 +4366,8 @@ def check(grad, residual): mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) decompr = mx.nd.zeros(grad.shape) mx.contrib.ndarray.dequantize_2bit(compr, decompr, neg_threshold, pos_threshold) - assert np.array_equal(compr.asnumpy(), np.array(exp_compr)) , (compr, exp_compr) - assert np.array_equal(decompr.asnumpy(), np.array(exp_decompr)) , (decompr, exp_decompr) + np.testing.assert_array_equal(compr.asnumpy(), np.array(exp_compr)) , (compr, exp_compr) + np.testing.assert_array_equal(decompr.asnumpy(), np.array(exp_decompr)) , (decompr, exp_decompr) # use almost equal for residual as this involves addition operation assert_almost_equal(residual.asnumpy(), np.array(exp_residual)) , (residual, exp_residual) From 5f130dd359e6af7e571c3846c2f743e269b642c0 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 1 Nov 2017 16:59:33 -0700 Subject: [PATCH 179/237] comment all debug statements --- src/kvstore/kvstore_dist.h | 74 +++++++++++++++---------------- src/kvstore/kvstore_dist_server.h | 37 ++++++++-------- 2 files changed, 55 insertions(+), 56 deletions(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index ad45702f4297..2377d67018ab 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -272,34 +272,34 @@ class KVStoreDist : public KVStoreLocal { for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devices int key = uniq_keys[i]; - if(count_save.count(key)==0) { - 
count_save[key] = 0; - } else { - count_save[key]++; - } +// if(count_save.count(key)==0) { +// count_save[key] = 0; +// } else { +// count_save[key]++; +// } const auto& vals = grouped_vals[i]; - if(count_save[key]<3 && get_rank()==0){ - {std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals0_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); - vals[0].WaitToRead(); - mxnet::NDArray::Save(fo.get(), {vals[0]},{}); - } - if(vals.size()>=2) { - std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals1_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); - vals[1].WaitToRead(); - mxnet::NDArray::Save(fo.get(), {vals[1]},{}); - } - - if(vals.size()>=3) { - std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals2_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); - vals[2].WaitToRead(); - mxnet::NDArray::Save(fo.get(), {vals[2]},{}); - } - if(vals.size()>=4) {std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals3_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); - vals[3].WaitToRead(); - mxnet::NDArray::Save(fo.get(), {vals[3]},{}); - } - } +// if(count_save[key]<3 && get_rank()==0){ +// {std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals0_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); +// vals[0].WaitToRead(); +// mxnet::NDArray::Save(fo.get(), {vals[0]},{}); +// } +// if(vals.size()>=2) { +// std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals1_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); +// vals[1].WaitToRead(); +// mxnet::NDArray::Save(fo.get(), {vals[1]},{}); +// } +// +// if(vals.size()>=3) { +// std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals2_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); +// 
vals[2].WaitToRead(); +// mxnet::NDArray::Save(fo.get(), {vals[2]},{}); +// } +// if(vals.size()>=4) {std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals3_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); +// vals[3].WaitToRead(); +// mxnet::NDArray::Save(fo.get(), {vals[3]},{}); +// } +// } NDArray merged = do_merge ? comm_->Reduce(key, vals, priority) : vals[0]; @@ -307,11 +307,11 @@ class KVStoreDist : public KVStoreLocal { auto &comm_buf = comm_buf_[key]; if (merged.ctx().dev_mask() == cpu::kDevMask) { - if(count_save[key]<3 && get_rank()==0) { - std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_merged_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); - merged.WaitToRead(); - mxnet::NDArray::Save(fo.get(), {merged},{}); - } +// if(count_save[key]<3 && get_rank()==0) { +// std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_merged_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); +// merged.WaitToRead(); +// mxnet::NDArray::Save(fo.get(), {merged},{}); +// } // Start of a push doesn't guarantee that the previous pushes are completed. 
// This shouldn't affect training of networks though because training involves @@ -329,11 +329,11 @@ class KVStoreDist : public KVStoreLocal { CopyFromTo(merged, &comm_buf); } - if(count_save[key]<3 && get_rank()==0) { - std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_data_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); - comm_buf.WaitToRead(); - mxnet::NDArray::Save(fo.get(), {comm_buf},{}); - } +// if(count_save[key]<3 && get_rank()==0) { +// std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_data_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); +// comm_buf.WaitToRead(); +// mxnet::NDArray::Save(fo.get(), {comm_buf},{}); +// } if (compress_ != "none") { auto &small_buf = compr_buf_[key]; @@ -788,7 +788,7 @@ class KVStoreDist : public KVStoreLocal { bool log_verbose_; - std::unordered_map count_save; +// std::unordered_map count_save; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 8ed55d7f5ed9..6253d55dbe27 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -381,11 +381,11 @@ class KVStoreDistServer { int key = DecodeKey(req_data.keys[1]); auto& stored = store_[key]; - if (count_save.count(key)==0) { - count_save[key] = 0; - } else { - count_save[key]++; - } +// if (count_save.count(key)==0) { +// count_save[key] = 0; +// } else { +// count_save[key]++; +// } size_t ds[] = {(size_t)req_data.lens[1]}; TShape dshape(ds, ds + 1); @@ -404,25 +404,24 @@ class KVStoreDistServer { if (stored.is_none()) { // initialization stored = NDArray(dshape, Context()); - { - std::unique_ptr fo( - dmlc::Stream::Create((compress_ + "server_recved_count" + std::to_string(count_save[key]) + "_" + std::to_string(key)).c_str(), "w")); - recved.WaitToRead(); - mxnet::NDArray::Save(fo.get(), {recved}, {}); - } +// { +// std::unique_ptr fo( +// dmlc::Stream::Create((compress_ + 
"server_recved_count" + std::to_string(count_save[key]) + "_" + std::to_string(key)).c_str(), "w")); +// recved.WaitToRead(); +// mxnet::NDArray::Save(fo.get(), {recved}, {}); +// } Dequantize(recved, &stored, neg_threshold, pos_threshold, compress_, 0); server->Response(req_meta); stored.WaitToRead(); - - { - std::unique_ptr fo( - dmlc::Stream::Create((compress_ + "server_stored_count" + std::to_string(count_save[key]) + "_" + std::to_string(key)).c_str(), "w")); - stored.WaitToRead(); - mxnet::NDArray::Save(fo.get(), {stored}, {}); - } +// { +// std::unique_ptr fo( +// dmlc::Stream::Create((compress_ + "server_stored_count" + std::to_string(count_save[key]) + "_" + std::to_string(key)).c_str(), "w")); +// stored.WaitToRead(); +// mxnet::NDArray::Save(fo.get(), {stored}, {}); +// } } else if (sync_mode_) { // synced push auto& merged = merge_buf_[key]; @@ -563,7 +562,7 @@ class KVStoreDistServer { float pos_threshold = 0.5; float neg_threshold = -0.5; - std::unordered_map count_save; +// std::unordered_map count_save; }; } // namespace kvstore From c1fbeb78599ee2187ae0548b8249ba983c4de03b Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 2 Nov 2017 00:00:26 +0000 Subject: [PATCH 180/237] change init to normal for now --- example/image-classification/common/fit.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index c863c16a0c92..742b2e619ddd 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -179,12 +179,12 @@ def fit(args, network, data_loader, **kwargs): monitor = mx.mon.Monitor(args.monitor, pattern=".*") if args.monitor > 0 else None - if args.network == 'alexnet': + # if args.network == 'alexnet': # AlexNet will not converge using Xavier - initializer = mx.init.Normal() - else: - initializer = mx.init.Xavier( - rnd_type='gaussian', factor_type="in", magnitude=2) + initializer = 
mx.init.Normal(0.75) +# else: + # initializer = mx.init.Xavier( + # rnd_type='gaussian', factor_type="in", magnitude=2) # initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), # evaluation metrices From 4e0bdedf4afeb00ac8ca5030c5ffaf6c1d91998c Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 1 Nov 2017 17:04:06 -0700 Subject: [PATCH 181/237] remove debug changes --- python/mxnet/model.py | 14 +++++++------- python/mxnet/module/base_module.py | 3 +-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 3a0ab5416d78..4f3dc9c79d68 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -98,15 +98,15 @@ def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names, update_o for idx, param_on_devs in enumerate(param_arrays): name = param_names[idx] kvstore.init(name, arg_params[name]) - fla = 0 - if arg_params[name].sum()[0] != 0: - fla=1 - print(name, arg_params[name]) + #fla = 0 + #if arg_params[name].sum()[0] != 0: + # fla=1 + # print(name, arg_params[name]) if update_on_kvstore: kvstore.pull(name, param_on_devs, priority=-idx) - nd.waitall() - if fla==1: - print('pulled ',param_on_devs.sum(), param_on_devs) + # nd.waitall() + # if fla==1: + # print('pulled ',param_on_devs.sum(), param_on_devs) def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names): """Perform update of param_arrays from grad_arrays on kvstore.""" diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py index 59005a423c6b..bae166e3ffd8 100644 --- a/python/mxnet/module/base_module.py +++ b/python/mxnet/module/base_module.py @@ -469,8 +469,7 @@ def fit(self, train_data, eval_data=None, eval_metric='acc', validation_metric = eval_metric if not isinstance(eval_metric, metric.EvalMetric): eval_metric = metric.create(eval_metric) - ndarray.waitall() - return + ################################################################################ # training loop 
################################################################################ From 8a083d2d33bb0cc961d608bfa1466153cbfe8e25 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 2 Nov 2017 17:57:31 -0700 Subject: [PATCH 182/237] reorg to create gc class, add delayed start to gc, untested: causing segfault --- CMakeLists.txt | 2 +- example/image-classification/common/fit.py | 13 +- include/mxnet/c_api.h | 12 +- include/mxnet/kvstore.h | 27 +- include/mxnet/ndarray.h | 27 -- python/mxnet/gluon/trainer.py | 20 +- python/mxnet/kvstore.py | 101 +++-- python/mxnet/model.py | 7 - python/mxnet/module/module.py | 17 +- src/c_api/c_api.cc | 12 +- src/io/image_io.cc | 2 +- src/kvstore/comm.h | 157 ++++---- src/kvstore/gc.cpp | 5 + src/kvstore/gc.h | 199 ++++++++++ src/kvstore/kvstore_dist.h | 419 +++++++++----------- src/kvstore/kvstore_dist_server.h | 87 ++-- src/kvstore/kvstore_local.h | 16 +- src/ndarray/ndarray.cc | 84 ---- src/ndarray/ndarray_function.cc | 19 - src/ndarray/ndarray_function.cu | 19 - src/ndarray/ndarray_function.h | 17 +- src/operator/contrib/two.rahul | 340 ---------------- src/operator/contrib/two_bit_quantize-inl.h | 59 ++- src/operator/contrib/two_bit_quantize.cc | 21 +- tests/cpp/operator/quantize_perf.cc | 2 +- tests/nightly/dist_sync_kvstore.py | 37 +- tests/nightly/test_kvstore.py | 28 +- tests/python/unittest/test_operator.py | 27 +- tools/bandwidth/measure.py | 7 +- 29 files changed, 671 insertions(+), 1112 deletions(-) create mode 100644 src/kvstore/gc.cpp create mode 100644 src/kvstore/gc.h delete mode 100644 src/operator/contrib/two.rahul diff --git a/CMakeLists.txt b/CMakeLists.txt index 63bc8d740b74..4af93b8b4e76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -435,7 +435,7 @@ else() target_link_libraries(mxnet PRIVATE "-Wl,--whole-archive $ -Wl,--no-whole-archive") target_link_libraries(mxnet PRIVATE mxnet_static) # Let cmake understand the dependency else() - add_library(mxnet SHARED ${SOURCE}) + add_library(mxnet SHARED 
${SOURCE} src/kvstore/gc.cpp src/kvstore/gc.h) endif() endif() diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 742b2e619ddd..2d02fece1ca9 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -119,7 +119,7 @@ def fit(args, network, data_loader, **kwargs): """ # kvstore kv = mx.kvstore.create(args.kv_store) - kv.set_compress({'compress':args.gc_type, 'pos_threshold':args.gc_threshold, 'neg_threshold':-1*args.gc_threshold}) + kv.set_gradient_compression({'compress':args.gc_type, 'threshold':args.gc_threshold}) # logging head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) @@ -179,13 +179,12 @@ def fit(args, network, data_loader, **kwargs): monitor = mx.mon.Monitor(args.monitor, pattern=".*") if args.monitor > 0 else None - # if args.network == 'alexnet': + if args.network == 'alexnet': # AlexNet will not converge using Xavier - initializer = mx.init.Normal(0.75) -# else: - # initializer = mx.init.Xavier( - # rnd_type='gaussian', factor_type="in", magnitude=2) - # initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), + initializer = mx.init.Normal(0.5) + else: + initializer = mx.init.Xavier(rnd_type='gaussian', + factor_type="in", magnitude=2) # evaluation metrices eval_metrics = ['accuracy'] diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 652a7d4167fc..cc821ca86221 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1534,15 +1534,13 @@ MXNET_DLL int MXKVStoreCreate(const char *type, /*! 
* \brief Set parameters to use low-bit compressed gradients * \param handle handle to the kvstore - * \param compress type of compression - * \param neg_threshold set the negative threshold for 2bit compression - * \param pos_threshold set the positive threshold for 2bit compression + * \param compression type of compression + * \param threshold set the threshold for 2bit compression * \return 0 when success, -1 when failure happens */ -MXNET_DLL int MXKVStoreSetCompress(KVStoreHandle handle, - const char *compress, - const float neg_threshold, - const float pos_threshold); +MXNET_DLL int MXKVStoreSetGradientCompression(KVStoreHandle handle, + const char *compression, + const float threshold); /*! * \brief Delete a KVStore handle. diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index 75fd49c19bb2..4dd0384ed66b 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -30,6 +30,7 @@ #include #include #include +#include #include "./ndarray.h" #if MXNET_USE_DIST_KVSTORE #include "ps/ps.h" @@ -66,12 +67,11 @@ class KVStore { /** * \brief Set parameters to use low-bit compressed gradients - * \param compress type of compression - * \param neg_threshold set the negative threshold for 2bit compression - * \param pos_threshold set the positive threshold for 2bit compression + * \param compression_type type of compression + * \param threshold set the threshold for 2bit compression */ - virtual void SetCompress(const std::string& compress, const float neg_threshold, - const float pos_threshold) = 0; + virtual void SetGradientCompression(const std::string& compression_type, + const float threshold) = 0; /*! * \brief Initialize a list of key-value pair to the store. 
@@ -396,21 +396,12 @@ class KVStore { */ std::string type_; - /** - * \brief Specifies whether or not to use compressed gradients - * Can be `none` or `2bit` for now - */ - std::string compress_ = "none"; - - /** - * \brief positive threshold for 2bit compression + /** \brief gradient compression + * starts with none, used after SetGradientCompression sets the type + * currently there is no support for unsetting gradient compression */ - float pos_threshold_ = 0.5; + Gc* gc_; - /** - * \brief negative threshold for 2bit compression - */ - float neg_threshold_ = -0.5; /** * \brief whether to do barrier when finalize diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 215707914d03..458a3e78d077 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -893,33 +893,6 @@ size_t num_aux_data(NDArrayStorageType stype); */ void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0); -/*! - * \brief Issue quantize operation to be scheduled by the engine - * Compresses `from` into `to` and accumulates the quantization error - * into 'residual' - * \param from the ndarray containing original data to be compressed - * \param to the target ndarray which contains compressed data - * \param residual the ndarray which accumulates quantization error - * \param compress type of compression - * \param neg_threshold negative threshold for 2bit quantization - * \param pos_threshold positive threshold for 2bit quantization - * \param priority Priority of the action. - */ -void Quantize(const NDArray &from, NDArray *to, NDArray *residual, const std::string& compress, - const float neg_threshold, const float pos_threshold, - int priority); - -/*! - * \brief Issue dequantize operation to be scheduled by the engine - * Dequantizes data in `from` into `to` - * \param from the ndarray containing compressed data - * \param to the target ndarray which contains original data - * \param compress type of compression - * \param priority Priority of the action. 
- */ -void Dequantize(const NDArray &from, NDArray *to, - const float neg_threshold, const float pos_threshold, const std::string& compress, int priority); - /*! * \brief issue an copy operation from one NDArray to another * the two ndarray can sit on different devices diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index d13691723a64..e58e605a87fd 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -44,14 +44,11 @@ class Trainer(object): kvstore : str or KVStore kvstore type for multi-gpu and distributed training. See help on :any:`mxnet.kvstore.create` for more information. - compress_params : dict + compression_params : dict Specifies type of gradient compression and additional arguments depending - on the type of compression being used. - For example, 2bit compression requires a positive threshold and negative threshold. - So to completely the arguments for 2bit compression, we would need to pass - a dictionary like the following. - {'compress':'2bit', 'positive_threshold':0.5, 'negative_threshold':-0.5} - See mxnet.KVStore.set_compress method for more details on gradient compression. + on the type of compression being used. For example, 2bit compression requires a threshold. + Arguments would then be {'compression':'2bit', 'threshold':0.5} + See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. Properties ---------- @@ -60,7 +57,7 @@ class Trainer(object): optimizer, its learning rate can be accessed as optimizer.learning_rate. 
""" def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', - compress_params=None): + compression_params=None): if isinstance(params, (dict, ParameterDict)): params = list(params.values()) if not isinstance(params, (list, tuple)): @@ -74,10 +71,7 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', "First argument must be a list or dict of Parameters, " \ "got list of %s."%(type(param))) self._params.append(param) - if compress_params: - if not isinstance(compress_params, dict): - raise ValueError("compress_params needs to be a dictionary") - self._compress_params = compress_params if compress_params else {'compress':'none'} + self._compression_params = compression_params optimizer_params = optimizer_params if optimizer_params else {} self._scale = optimizer_params.get('rescale_grad', 1.0) self._contexts = self._check_contexts() @@ -116,7 +110,7 @@ def _init_kvstore(self): kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts), arg_arrays) if kvstore: - kvstore.set_compress(self._compress_params) + kvstore.set_gradient_compression(self._compression_params) if 'dist' in kvstore.type: update_on_kvstore = False for i, param in enumerate(self._params): diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 274ad3a1ec18..119e08ff7fb9 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -349,24 +349,24 @@ def row_sparse_pull(self, key, out=None, priority=0, row_ids=None): check_call(_LIB.MXKVStorePullRowSparse( self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority))) - def set_compress(self, compress_params=None): + def set_gradient_compression(self, compression_params=None): """ Specifies type of low-bit quantization for gradient compression if any, and additional arguments depending on the type of compression being used. 
Parameters ---------- - compress_params : dict - `compress_params` is a dictionary specifying the type and parameters - for gradient compression. The key `compress` in this dictionary is a required argument + compression_params : dict + `compression_params` is a dictionary specifying the type and parameters + for gradient compression. The key `compression` in this dictionary is a required argument and specifies the type of gradient compression. Other keys in this dictionary are optional and specific to the type of gradient compression. - 2bit Gradient Compression - --------- - 2bit gradient compression takes two thresholds, one for positive values and - other for negative thresholds. This works by limiting positive values in the - gradient to the positive threshold, and limiting negative values to the - negative threshold. Values which don't meet the thresholds are set to 0. + 2bit Gradient Compression: + + 2bit gradient compression takes a threshold. This needs to be a positive float. + The technique works by limiting values such that the absolute values of the gradient + communicated is less than the threshold. Values which don't meet the threshold + are set to 0. By doing so, each value in the gradient is in one of three states. 2bits are used to represent these states, and every 16 float values in the original gradient can be represented using one float. This compressed representation @@ -390,59 +390,48 @@ def set_compress(self, compress_params=None): not compressed. Server to worker communication (in the case of pull) is also not compressed. - To use 2bit compression, we need to specify `compress` as `2bit`. - Only specifying `compress` would use default values - for the other arguments of thresholds. + To use 2bit compression, we need to specify `compression` as `2bit`. + Only specifying `compression` would use default value for the threshold. 
To completely specify the arguments for 2bit compression, we would need to pass - a dictionary which includes `positive_threshold` and `negative_threshold` like: - {'compress':'2bit', 'positive_threshold':0.5, 'negative_threshold':-0.5} - compress: str + a dictionary which includes `threshold` like: + {'compression':'2bit', 'threshold':0.5} + + compression: str type of low-bit quantization to be used for gradient compression Can only be '2bit' for now. 2bit gradient compression uses 2bit quantization with residual to compress gradients. It works by converts each value in the original gradient to use 2 bits, causing size of gradient to be 1/16th of the original gradient - (and 3 floats of meta information). - pos_threshold: float - positive threshold used for 2bit quantization of gradients - Positive values in gradient above positive threshold will be set to - positive threshold. Positive values lesser than positive threshold will - be set to 0. - neg_threshold: float - negative threshold used for 2bit quantization of gradients - Negative values in gradient less than negative threshold will be set to - negative threshold. Negative values greater than negative threshold will - be set to 0. + threshold: float + must be greater than 0 + threshold used for 2bit quantization of gradients + Positive values in gradient above threshold will be set to + threshold. Negative values whose absolute values are higher than threshold, + will be set to the negative of threshold. Values whose absolute values are + less than threshold will be set to 0. 
""" - compress_params = compress_params if compress_params else {'compress':'none'} - if 'compress' not in compress_params: - raise ValueError('compress_params requires compress to be set') - elif not isinstance(compress_params['compress'], string_types): - raise TypeError('compress must be a string') - elif compress_params['compress'] not in ['none', '2bit']: - raise ValueError('Unsupported type of compression') - - if compress_params['compress'] == '2bit': - if 'pos_threshold' in compress_params: - if not isinstance(compress_params['pos_threshold'], numeric_types): - raise TypeError('pos_threshold must be a numeric type') - else: - compress_params['pos_threshold'] = 0.5 - - if 'neg_threshold' in compress_params: - if not isinstance(compress_params['neg_threshold'], numeric_types): - raise TypeError('neg_threshold must be a numeric type') - else: - compress_params['neg_threshold'] = -0.5 - - if compress_params['pos_threshold'] <= 0 or compress_params['neg_threshold'] >= 0: - raise ValueError('pos_threshold needs to be greater than 0, \ - and neg_threshold needs to be less than 0') - - check_call(_LIB.MXKVStoreSetCompress(self.handle, - c_str(compress_params['compress']), - mx_float(compress_params['neg_threshold']), - mx_float(compress_params['pos_threshold']))) + if compression_params: + if not isinstance(compression_params, dict): + raise ValueError("compression_params needs to be a dictionary") + if 'compression' not in compression_params: + raise ValueError('compression_params requires `compression` to be set') + elif not isinstance(compression_params['compression'], string_types): + raise TypeError('compression must be a string') + elif compression_params['compression'] not in ['2bit']: + raise ValueError('Unsupported type of compression') + + if compression_params['compression'] == '2bit': + if 'threshold' in compression_params: + if not isinstance(compression_params['threshold'], numeric_types): + raise TypeError('threshold must be a numeric type') + if 
compression_params['threshold'] <= 0: + raise ValueError('threshold must be greater than 0') + else: + compression_params['threshold'] = 0.5 + + check_call(_LIB.MXKVStoreSetGradientCompression(self.handle, + c_str(compression_params['compression']), + mx_float(compression_params['threshold']))) def set_optimizer(self, optimizer): """ Registers an optimizer with the kvstore. diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 4f3dc9c79d68..7f7b272fcdeb 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -98,15 +98,8 @@ def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names, update_o for idx, param_on_devs in enumerate(param_arrays): name = param_names[idx] kvstore.init(name, arg_params[name]) - #fla = 0 - #if arg_params[name].sum()[0] != 0: - # fla=1 - # print(name, arg_params[name]) if update_on_kvstore: kvstore.pull(name, param_on_devs, priority=-idx) - # nd.waitall() - # if fla==1: - # print('pulled ',param_on_devs.sum(), param_on_devs) def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names): """Perform update of param_arrays from grad_arrays on kvstore.""" diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index 310443a89aae..2daa0cb63925 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -59,19 +59,16 @@ class Module(BaseModule): state_names : list of str states are similar to data and label, but not provided by data iterator. Instead they are initialized to 0 and can be set by `set_states()`. - compress_params : dict + compression_params : dict Specifies type of gradient compression and additional arguments depending - on the type of compression being used. - For example, 2bit compression requires a positive threshold and negative threshold. - So to completely the arguments for 2bit compression, we would need to pass - a dictionary like the following. 
- {'compress':'2bit', 'positive_threshold':0.5, 'negative_threshold':-0.5} - See mxnet.KVStore.set_compress method for more details on gradient compression. + on the type of compression being used. For example, 2bit compression requires a threshold. + Arguments would then be {'compression':'2bit', 'threshold':0.5} + See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. """ def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',), logger=logging, context=ctx.cpu(), work_load_list=None, - fixed_param_names=None, state_names=None, compress_params=None): + fixed_param_names=None, state_names=None, compression_params=None): super(Module, self).__init__(logger=logger) if isinstance(context, ctx.Context): @@ -108,7 +105,7 @@ def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',), self._aux_params = None self._params_dirty = False - self._compress_params = compress_params if compress_params else {'compress':'none'} + self._compression_params = compression_params self._optimizer = None self._kvstore = None self._update_on_kvstore = None @@ -531,7 +528,7 @@ def init_optimizer(self, kvstore='local', optimizer='sgd', self._updater = None if kvstore: - kvstore.set_compress(self._compress_params) + kvstore.set_gradient_compression(self._compression_params) # copy initialized local parameters to kvstore _initialize_kvstore(kvstore=kvstore, param_arrays=self._exec_group.param_arrays, diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 76cf046d57d6..78fdd8eabee8 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -733,14 +733,10 @@ int MXKVStoreCreate(const char *type, API_END(); } -int MXKVStoreSetCompress(KVStoreHandle handle, - const char *compress, - const float neg_threshold, - const float pos_threshold) { - API_BEGIN(); - static_cast(handle)->SetCompress(compress, - neg_threshold, - pos_threshold); +int MXKVStoreSetGradientCompression(KVStoreHandle handle, + const char 
*compress, const float threshold) { + API_BEGIN(); + static_cast(handle)->SetGradientCompression(compress, threshold); API_END(); } diff --git a/src/io/image_io.cc b/src/io/image_io.cc index 9081a3734bc4..f5d6e3f54579 100644 --- a/src/io/image_io.cc +++ b/src/io/image_io.cc @@ -236,7 +236,7 @@ void Imread(const nnvm::NodeAttrs& attrs, Engine::Get()->PushSync([ndout, buff, fsize, param](RunContext ctx){ ImdecodeImpl(param.flag, param.to_rgb, buff, fsize, const_cast(&ndout)); - delete buff; + delete[] buff; }, ndout.ctx(), {}, {ndout.var()}, FnProperty::kNormal, 0, PROFILER_MESSAGE("Imread")); #else diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 5a8412fd7bf9..c927962e5f4d 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -83,31 +83,15 @@ class Comm { * \brief Sets gradient compression parameters to be able to * perform reduce with compressed gradients */ - inline void SetCompress(const std::string& compress, - const float neg_threshold, - const float pos_threshold) { - compress_ = compress; - pos_threshold_ = pos_threshold; - neg_threshold_ = neg_threshold; + void SetGradientCompression(Gc* gc) { + gc_ = gc; } protected: Context pinned_ctx_; - /* - * \brief Sets type of gradient compression - */ - std::string compress_ = "none"; - - /* - * \brief sets positive threshold for 2bit gradient compression - */ - float pos_threshold_; + Gc* gc_; - /* - * \brief sets negative threshold for 2bit gradient compression - */ - float neg_threshold_; }; /** @@ -505,9 +489,14 @@ class CommDevice : public Comm { const NDArray& Reduce(int key, const std::vector& src, int priority) override { + + if (gc_->get_active_type() != GC_NONE) { + return ReduceCompressed(key, src, priority); + } + // avoid extra copy for single device, but it may bring problems for // abnormal usage of kvstore - if (src.size() == 1 && compress_ == "none") { + if (src.size() == 1) { return src[0]; } @@ -524,80 +513,78 @@ class CommDevice : public Comm { auto& buf = merge_buf_[key]; 
std::vector reduce(src.size()); - - if (compress_ == "none") { - CopyFromTo(src[0], &(buf.merged), priority); - reduce[0] = buf.merged; - - if (buf.copy_buf.empty()) { - // TODO(mli) this results in large device memory usage for huge ndarray, - // such as the largest fullc in VGG. consider to do segment reduce with - // NDArray.Slice or gpu direct memory access. for the latter, we need to - // remove some ctx check, and also it reduces 20% perf - buf.copy_buf.resize(src.size()-1); - for (size_t i = 0; i < src.size()-1; ++i) { - buf.copy_buf[i] = NDArray( - buf.merged.shape(), buf.merged.ctx(), false, buf.merged.dtype()); - } - } + CopyFromTo(src[0], &(buf.merged), priority); + reduce[0] = buf.merged; + if (buf.copy_buf.empty()) { + // TODO(mli) this results in large device memory usage for huge ndarray, + // such as the largest fullc in VGG. consider to do segment reduce with + // NDArray.Slice or gpu direct memory access. for the latter, we need to + // remove some ctx check, and also it reduces 20% perf + buf.copy_buf.resize(src.size()-1); for (size_t i = 0; i < src.size()-1; ++i) { - CopyFromTo(src[i+1], &(buf.copy_buf[i]), priority); - reduce[i+1] = buf.copy_buf[i]; + buf.copy_buf[i] = NDArray( + buf.merged.shape(), buf.merged.ctx(), false, buf.merged.dtype()); } - } else { - if (buf.copy_buf.empty()) { - // one buf for each context - buf.copy_buf.resize(src.size()); - buf.small_recv_buf.resize(src.size()); - buf.small_send_buf.resize(src.size()); - buf.residual.resize(src.size()); - - for (size_t i = 0; i < src.size(); ++i) { - buf.copy_buf[i] = NDArray(buf.merged.shape(), buf.merged.ctx(), - false, buf.merged.dtype()); - buf.residual[i] = NDArray(buf.merged.shape(), src[i].ctx(), - false, buf.merged.dtype()); - buf.residual[i] = 0; - if (compress_ == "2bit") { - int bits = 16; - int64_t small_size = buf.merged.shape().Size() % bits == 0 ? 
- buf.merged.shape().Size() / bits : - buf.merged.shape().Size() / bits + 1; - buf.small_recv_buf[i] = NDArray(TShape{small_size}, buf.merged.ctx(), - false, buf.merged.dtype()); - buf.small_send_buf[i] = NDArray(TShape{small_size}, src[i].ctx(), - false, buf.merged.dtype()); - } else { - LOG(FATAL) << "Unsupported type of compression " << compress_; - } - } + } + for (size_t i = 0; i < src.size()-1; ++i) { + CopyFromTo(src[i+1], &(buf.copy_buf[i]), priority); + reduce[i+1] = buf.copy_buf[i]; + } + ElementwiseSum(reduce, &buf.merged); + return buf.merged; + } + + const NDArray& ReduceCompressed(int key, const std::vector& src, + int priority) { + if (!inited_) { + std::vector devs; + for (const auto& a : src) { + devs.push_back(a.ctx()); + } + InitMergeBuffer(devs); + if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) { + EnableP2P(devs); } + } + + auto& buf = merge_buf_[key]; + std::vector reduce(src.size()); + if (buf.copy_buf.empty()) { + // one buf for each context + buf.copy_buf.resize(src.size()); + buf.small_recv_buf.resize(src.size()); + buf.small_send_buf.resize(src.size()); + buf.residual.resize(src.size()); for (size_t i = 0; i < src.size(); ++i) { - // compress before copy - // this is done even if the data is on same context as copy_buf because - // we don't want the training to be biased towards data on this GPU - if (compress_ == "2bit") { - Quantize(src[i], &(buf.small_send_buf[i]), &(buf.residual[i]), compress_, - neg_threshold_, pos_threshold_, priority); - if (buf.small_send_buf[i].ctx() != buf.small_recv_buf[i].ctx()) { - CopyFromTo(buf.small_send_buf[i], &(buf.small_recv_buf[i]), priority); - } else { - // avoid memory copy when they are on same context - buf.small_recv_buf[i] = buf.small_send_buf[i]; - } - // TODO (undo comment) - Dequantize(buf.small_recv_buf[i], &(buf.copy_buf[i]), - neg_threshold_, pos_threshold_, compress_, priority); - } else { - LOG(FATAL) << "Unsupported type of compression " << compress_; - } - reduce[i] = buf.copy_buf[i]; + 
buf.copy_buf[i] = NDArray(buf.merged.shape(), buf.merged.ctx(), + false, buf.merged.dtype()); + buf.residual[i] = NDArray(buf.merged.shape(), src[i].ctx(), + false, buf.merged.dtype()); + buf.residual[i] = 0; + int64_t small_size = gc_->GetCompressedSize(buf.merged.shape().Size()); + buf.small_recv_buf[i] = NDArray(TShape{small_size}, buf.merged.ctx(), + false, buf.merged.dtype()); + buf.small_send_buf[i] = NDArray(TShape{small_size}, src[i].ctx(), + false, buf.merged.dtype()); } } + for (size_t i = 0; i < src.size(); ++i) { + // compress before copy + // this is done even if the data is on same context as copy_buf because + // we don't want the training to be biased towards data on this GPU + gc_->Quantize(src[i], &(buf.small_send_buf[i]), &(buf.residual[i]), priority); + if (buf.small_send_buf[i].ctx() != buf.small_recv_buf[i].ctx()) { + CopyFromTo(buf.small_send_buf[i], &(buf.small_recv_buf[i]), priority); + } else { + // avoid memory copy when they are on same context + buf.small_recv_buf[i] = buf.small_send_buf[i]; + } + gc_->Dequantize(buf.small_recv_buf[i], &(buf.copy_buf[i]), priority); + reduce[i] = buf.copy_buf[i]; + } ElementwiseSum(reduce, &buf.merged); - return buf.merged; } diff --git a/src/kvstore/gc.cpp b/src/kvstore/gc.cpp new file mode 100644 index 000000000000..68ea2ad0214f --- /dev/null +++ b/src/kvstore/gc.cpp @@ -0,0 +1,5 @@ +// +// Created by Huilgol, Rahul on 11/1/17. +// + +#include "gc.h" diff --git a/src/kvstore/gc.h b/src/kvstore/gc.h new file mode 100644 index 000000000000..e3a3b28efc0f --- /dev/null +++ b/src/kvstore/gc.h @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file gc.h + * \brief Gradient compression for kvstore + * \author Rahul Huilgol + */ + +#ifndef MXNET_KVSTORE_GC_H +#define MXNET_KVSTORE_GC_H +#include +#include +#include + +#include +#include +#include + +// TODO check if it returns empty between two delims +template +void split(const std::string &s, const char delim, Out result) { + std::stringstream ss; + ss.str(s); + std::string item; + while (std::getline(ss, item, delim)) { + *(result++) = item; + } +} + +enum CompressionType { + GC_NONE, GC_TWO_BIT +}; + +class Gc { + public: + Gc() { + type_ = GC_NONE; + active_ = false; + } + + virtual ~Gc() { } + + void SetParams(const std::string& compression_type, const float threshold) { + if (compression_type == "2bit") { + SetTwoBitCompression(threshold); + } + } + + void set_active() { + active_ = true; + } + + bool get_active_type() { + if (active_) return type_; + else return GC_NONE; + } + + void SetTwoBitCompression(const float threshold) { + type_ = GC_TWO_BIT; + threshold_ = threshold; + } + + std::string EncodeParams() { + std::string rval = std::to_string(type_); + if (type_ == GC_TWO_BIT) { + rval += "," + std::to_string(threshold_); + } + return rval; + } + + void DecodeParams(const std::string& s) { + std::vector elems; + split(s, ',', std::back_inserter(elems)); + type_ = static_cast(stoi(elems[0])); + if (elems.size()>1) { + if (!elems[1].empty()) { + threshold_ = stof(elems[1]); + } + } + } + + int GetCompressionFactor() { + if (type_ == GC_TWO_BIT) { + return 16; + } else { + LOG(FATAL) << "Unsupported compression 
type"; + return 0; + } + } + + int64_t GetCompressedSize(const int64_t original_size){ + const int bits = GetCompressionFactor(); + return ((original_size % bits == 0) ? + original_size / bits : + original_size / bits + 1); + } + + void Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, + mxnet::NDArray *residual, const int priority) { + CHECK(from.shape().ndim() != 0) + << "source operands have zero dimension shape"; + // important: callback must always capture by value + int a = from.ctx().dev_mask(); + int b = to->ctx().dev_mask(); + const float threshold = threshold_; + if (type_ == GC_TWO_BIT) { + if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), residual->data(), to->data()}; + mxnet::op::Quantize2BitImpl(ctx.get_stream(), inputs, threshold); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); + } else { + #if MXNET_USE_CUDA + if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), residual->data(), to->data()}; + mxnet::op::Quantize2BitImpl(ctx.get_stream(), inputs, threshold); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); + } else { + LOG(FATAL) << "unknown device mask"; + } + #else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + #endif + } + } else { + LOG(FATAL) << "Unsupported quantization of type " << type_; + } + } + + void Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority) { + CHECK(from.shape().ndim() != 0) + << "source operands have zero dimension shape"; + // important: callback must always capture by value + 
const int a = from.ctx().dev_mask(); + const int b = to->ctx().dev_mask(); + const float threshold = threshold_; + if (type_ == GC_TWO_BIT) { + if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), to->data()}; + mxnet::op::Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); + }, from.ctx(), {from.var()}, {to->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); + } else { + #if MXNET_USE_CUDA + if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), to->data()}; + mxnet::op::Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {to->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); + } else { + LOG(FATAL) << "unknown device mask"; + } + #else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + #endif + } + } else { + LOG(FATAL) << "Unsupported dequantization of type " << type_; + } + } + + CompressionType type_; + + bool active_; + + float threshold_ = 0; + +private: + +}; + + +#endif //MXNET_KVSTORE_GC_H diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 2377d67018ab..49ffcd9a0ce0 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -63,6 +63,7 @@ class KVStoreDist : public KVStoreLocal { } virtual ~KVStoreDist() { + delete gc_; Engine::Get()->WaitForAll(); if (IsWorkerNode()) { if (barrier_before_exit_) { @@ -86,12 +87,12 @@ class KVStoreDist : public KVStoreLocal { } } - void SetCompress(const std::string& compress, const float neg_threshold, - const float pos_threshold) override { - KVStoreLocal::SetCompress(compress, neg_threshold, pos_threshold); + void SetGradientCompression(const std::string& 
compression_type, const float threshold) override { + KVStoreLocal::SetGradientCompression(compression_type, threshold); if (get_rank() == 0) { - SendCommandToServers(kSetCompress, compress_); + SendCommandToServers(kSetGradientCompression, gc_->EncodeParams()); } + //TODO barrier? } void Barrier() override { @@ -140,6 +141,36 @@ class KVStoreDist : public KVStoreLocal { } private: + /** + * \brief struct for ps keys and lens + */ + struct PSKV { + ps::SArray keys; // n keys + ps::SArray lens; // the length of the i-th value + int size; + }; + + struct ComprPSKV { + PSKV push; + PSKV pull; + }; + + /** + * \brief cache all key partitions + * + * `ps_kv_` is used for row sparse + * + * `push_ps_kv_` and `pull_ps_kv_`, used for default type gradients, are same + * when there is no gradient compression + */ + std::unordered_map ps_kv_; + std::unordered_map compr_ps_kv_; + + /** + * \brief serialize access to ps_kv_ or push_ps_kv_/pull_ps_kv_ while encoding keys + */ + std::mutex mu_; + void InitImpl(const std::vector& keys, const std::vector& values) override { CheckUnique(keys); @@ -153,6 +184,8 @@ class KVStoreDist : public KVStoreLocal { comm_buf_[key].WaitToWrite(); compr_buf_[key].WaitToWrite(); } + gc_->set_active(); + } else { // do nothing } @@ -191,7 +224,9 @@ class KVStoreDist : public KVStoreLocal { RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys size_t size = recv_buf.shape().Size(); - PSKV& pskv = EncodeKey(key, size, false); + PSKV& pskv = (gc_->get_active_type() == GC_NONE) ? + EncodeDefaultKey(key, size, false) : + EncodeCompressedKey(key, size, false); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(recv_buf.data()); #endif @@ -199,8 +234,9 @@ class KVStoreDist : public KVStoreLocal { // false means not to delete data when SArray is deleted auto vals = new ps::SArray(data, size, false); // issue pull + int cmd = (gc_->get_active_type() == GC_NONE) ? 
kDefaultPushPull : kCompressedPushPull; CHECK_NOTNULL(ps_worker_)->ZPull( - pskv.keys, vals, &pskv.lens, kDefaultPushPull, [vals, cb](){ delete vals; cb(); }); + pskv.keys, vals, &pskv.lens, cmd, [vals, cb](){ delete vals; cb(); }); }; CHECK_NOTNULL(Engine::Get())->PushAsync( @@ -210,7 +246,7 @@ class KVStoreDist : public KVStoreLocal { {recv_buf.var()}, FnProperty::kNormal, priority, - PROFILER_MESSAGE("KVStoreDistDefaultPull")); + PROFILER_MESSAGE("KVStoreDistDefaultStoragePull")); comm_->Broadcast(key, recv_buf, grouped_vals[i], priority); } @@ -272,47 +308,12 @@ class KVStoreDist : public KVStoreLocal { for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devices int key = uniq_keys[i]; -// if(count_save.count(key)==0) { -// count_save[key] = 0; -// } else { -// count_save[key]++; -// } - const auto& vals = grouped_vals[i]; -// if(count_save[key]<3 && get_rank()==0){ -// {std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals0_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); -// vals[0].WaitToRead(); -// mxnet::NDArray::Save(fo.get(), {vals[0]},{}); -// } -// if(vals.size()>=2) { -// std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals1_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); -// vals[1].WaitToRead(); -// mxnet::NDArray::Save(fo.get(), {vals[1]},{}); -// } -// -// if(vals.size()>=3) { -// std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals2_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); -// vals[2].WaitToRead(); -// mxnet::NDArray::Save(fo.get(), {vals[2]},{}); -// } -// if(vals.size()>=4) {std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_vals3_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); -// vals[3].WaitToRead(); -// mxnet::NDArray::Save(fo.get(), {vals[3]},{}); -// } -// } - NDArray merged = do_merge ? 
comm_->Reduce(key, vals, priority) : vals[0]; const auto storage_type = merged.storage_type(); auto &comm_buf = comm_buf_[key]; if (merged.ctx().dev_mask() == cpu::kDevMask) { - -// if(count_save[key]<3 && get_rank()==0) { -// std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_merged_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); -// merged.WaitToRead(); -// mxnet::NDArray::Save(fo.get(), {merged},{}); -// } - // Start of a push doesn't guarantee that the previous pushes are completed. // This shouldn't affect training of networks though because training involves // a sequence of push, pull, then push. This imposes ordering that the @@ -328,58 +329,65 @@ class KVStoreDist : public KVStoreLocal { } CopyFromTo(merged, &comm_buf); } - -// if(count_save[key]<3 && get_rank()==0) { -// std::unique_ptr fo(dmlc::Stream::Create((compress_ + "grad_data_count" + std::to_string(count_save[key]) + "_key" + std::to_string(key)).c_str(), "w")); -// comm_buf.WaitToRead(); -// mxnet::NDArray::Save(fo.get(), {comm_buf},{}); -// } - - if (compress_ != "none") { - auto &small_buf = compr_buf_[key]; - auto &res_buf = residual_[key]; - size_t original_size = comm_buf.shape().Size(); - // returns push_pskv - PSKV &pskv = EncodeCompressedKey(key, original_size, true); - // Init the small buffer and residual_ buffer for quantize - if (small_buf.is_none()) { - if (storage_type == kDefaultStorage) { - // small buffer for quantize - small_buf = NDArray(TShape{pskv.size}, comm_buf.ctx(), false, comm_buf.dtype()); - // residual buffer for quantize - res_buf = NDArray(TShape{(int64_t) original_size}, comm_buf.ctx(), - false, comm_buf.dtype()); - res_buf = 0; - } else { - LOG(FATAL) << "compression for non default storage type unsupported"; - } - } - if (compress_ == "2bit") { - Quantize(comm_buf, &small_buf, &res_buf, - compress_, neg_threshold_, pos_threshold_, priority);//neg_threshold_, pos_threshold_, priority); + // push to servers + if 
(storage_type == kDefaultStorage) { + if (gc_->get_active_type() == GC_NONE) { + PushDefault(key, comm_buf, priority); } else { - LOG(FATAL) << "Unsupported quantization"; + PushCompressed(key, comm_buf, priority); } - - if (storage_type == kDefaultStorage) { - PushCompressed(key, comm_buf, small_buf, pskv, priority); - } else { - LOG(FATAL) << "compression for non default storage type unsupported"; + } else if (storage_type == kRowSparseStorage) { + if (gc_->get_active_type() != GC_NONE) { + LOG(FATAL) << "Gradient compression for row sparse storage type is not supported"; } + PushRowSparse(key, comm_buf, priority); } else { - // push to servers - if (storage_type == kDefaultStorage) { - PushDefault(key, comm_buf, priority); - } else if (storage_type == kRowSparseStorage) { - PushRowSparse(key, comm_buf, priority); - } else { - LOG(FATAL) << "unknown storage type"; - } + LOG(FATAL) << "unknown storage type"; } } } + void PushCompressed(int key, const NDArray& comm_buf, int priority) { + auto &small_buf = compr_buf_[key]; + auto &res_buf = residual_[key]; + size_t original_size = comm_buf.shape().Size(); + + // returns push_pskv but calculates both push and pull pskv + PSKV &pskv = EncodeCompressedKey(key, original_size, true); + + // Init the small buffer and residual_ buffer for quantize + if (small_buf.is_none()) { + small_buf = NDArray(TShape{pskv.size}, comm_buf.ctx(), false, comm_buf.dtype()); + res_buf = NDArray(TShape{(int64_t) original_size}, comm_buf.ctx(), + false, comm_buf.dtype()); + res_buf = 0; + } + gc_->Quantize(comm_buf, &small_buf, &res_buf, priority); + auto push_to_servers = + [this, key, pskv, comm_buf, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) { + size_t size = small_buf.shape().Size(); + real_t* data = small_buf.data().dptr(); +#if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(small_buf.data()); +#endif + // do push. 
false means no delete + ps::SArray vals(data, size, false); + CHECK_NOTNULL(ps_worker_)->ZPush( + pskv.keys, vals, pskv.lens, kCompressedPushPull, [cb]() { cb(); }); + }; + // acquire locks on both comm_buf and small_buf so that + // pull (which uses comm_buf) for the same key waits till push finishes + Engine::Get()->PushAsync( + push_to_servers, + pinned_ctx_, + {small_buf.var(), comm_buf.var()}, + {}, + FnProperty::kNormal, + priority, + PROFILER_MESSAGE("KVStoreDistCompressedPush")); + } + void PushDefault(int key, const NDArray &send_buf, int priority) { auto push_to_servers = [this, key, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { @@ -405,48 +413,6 @@ class KVStoreDist : public KVStoreLocal { PROFILER_MESSAGE("KVStoreDistDefaultPush")); } - // pull row sparse weight into `recv_buf` based on indices given by `indices` - void PullRowSparse_(const int key, const NDArray& recv_buf, - const NDArray& indices, int priority) { - using namespace rowsparse; - auto pull_from_servers = [this, key, recv_buf, indices] - (RunContext rctx, Engine::CallbackOnComplete cb) { - // allocate memory for the buffer - size_t num_rows = indices.shape().Size(); - recv_buf.CheckAndAlloc({mshadow::Shape1(num_rows)}); -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(recv_buf.data()); -#endif - real_t* data = recv_buf.data().dptr(); - const auto offsets = indices.data().dptr(); - const auto unit_len = recv_buf.shape().ProdShape(1, recv_buf.shape().ndim()); - const int64_t size = num_rows * unit_len; - // convert to ps keys in row sparse format - PSKV& pskv = EncodeRowSparseKey(key, size, num_rows, offsets, - unit_len, recv_buf.shape()[0]); - if (this->log_verbose_) { - LOG(INFO) << "worker " << get_rank() << " pull lens: " << pskv.lens << " keys: " - << pskv.keys << " size: " << size; - } - auto vals = new ps::SArray(data, size, false); - // copy indices to recv_buf. 
this needs to be done before ZPull - // because after pull is done, the callback function returns and locks are released. - // at this point, later functions may access the indices variable while copy happens - mshadow::Copy(recv_buf.aux_data(kIdx).FlatTo1D(), - indices.data().FlatTo1D()); - CHECK_NOTNULL(ps_worker_)->ZPull(pskv.keys, vals, &pskv.lens, kRowSparsePushPull, - [vals, cb]() { delete vals; cb(); }); - }; - CHECK_NOTNULL(Engine::Get())->PushAsync( - pull_from_servers, - pinned_ctx_, - {indices.var()}, - {recv_buf.var()}, - FnProperty::kNormal, - priority, - PROFILER_MESSAGE("KVStoreDistRowSparsePull")); - } - // push row sparse gradient void PushRowSparse(int key, const NDArray &send_buf, int priority) { using namespace rowsparse; @@ -483,6 +449,49 @@ class KVStoreDist : public KVStoreLocal { PROFILER_MESSAGE("KVStoreDistRowSparsePush")); } + + // pull row sparse weight into `recv_buf` based on indices given by `indices` + void PullRowSparse_(const int key, const NDArray& recv_buf, + const NDArray& indices, int priority) { + using namespace rowsparse; + auto pull_from_servers = [this, key, recv_buf, indices] + (RunContext rctx, Engine::CallbackOnComplete cb) { + // allocate memory for the buffer + size_t num_rows = indices.shape().Size(); + recv_buf.CheckAndAlloc({mshadow::Shape1(num_rows)}); +#if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(recv_buf.data()); +#endif + real_t* data = recv_buf.data().dptr(); + const auto offsets = indices.data().dptr(); + const auto unit_len = recv_buf.shape().ProdShape(1, recv_buf.shape().ndim()); + const int64_t size = num_rows * unit_len; + // convert to ps keys in row sparse format + PSKV& pskv = EncodeRowSparseKey(key, size, num_rows, offsets, + unit_len, recv_buf.shape()[0]); + if (this->log_verbose_) { + LOG(INFO) << "worker " << get_rank() << " pull lens: " << pskv.lens << " keys: " + << pskv.keys << " size: " << size; + } + auto vals = new ps::SArray(data, size, false); + // copy indices to recv_buf. 
this needs to be done before ZPull + // because after pull is done, the callback function returns and locks are released. + // at this point, later functions may access the indices variable while copy happens + mshadow::Copy(recv_buf.aux_data(kIdx).FlatTo1D(), + indices.data().FlatTo1D()); + CHECK_NOTNULL(ps_worker_)->ZPull(pskv.keys, vals, &pskv.lens, kRowSparsePushPull, + [vals, cb]() { delete vals; cb(); }); + }; + CHECK_NOTNULL(Engine::Get())->PushAsync( + pull_from_servers, + pinned_ctx_, + {indices.var()}, + {recv_buf.var()}, + FnProperty::kNormal, + priority, + PROFILER_MESSAGE("KVStoreDistRowSparsePull")); + } + /** * \brief check if the keys are all unique */ @@ -494,89 +503,62 @@ class KVStoreDist : public KVStoreLocal { } /** - * \brief struct for ps keys and lens - */ - struct PSKV { - ps::SArray keys; // n keys - ps::SArray lens; // the length of the i-th value - int size; - }; - - /** - * \brief cache all key partitions - * - * `ps_kv_` is used for row sparse - * - * `push_ps_kv_` and `pull_ps_kv_`, used for default type gradients, are same - * when there is no gradient compression - */ - std::unordered_map ps_kv_; - std::unordered_map push_ps_kv_; - std::unordered_map pull_ps_kv_; - - /** - * \brief serialize access to ps_kv_ or push_ps_kv_/pull_ps_kv_ while encoding keys + * \brief convert to keys in ps */ - std::mutex mu_; - - void PushCompressed(int key, const NDArray& comm_buf, const NDArray &small_buf, - const PSKV& pskv, int priority) { - auto push_to_servers = - [this, key, comm_buf, pskv, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) { - size_t size = small_buf.shape().Size(); - real_t* data = small_buf.data().dptr(); - #if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(small_buf.data()); - #endif - // do push. 
false means no delete - ps::SArray vals(data, size, false); - CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, kCompressedPushPull, [cb]() { cb(); }); - }; - // acquire locks on both comm_buf and small_buf so that pull (which uses comm_buf) - // for the same key waits till push finishes - Engine::Get()->PushAsync( - push_to_servers, - pinned_ctx_, - {small_buf.var(), comm_buf.var()}, - {}, - FnProperty::kNormal, - priority, - PROFILER_MESSAGE("KVStoreDistCompressedPush")); - } - - - - PSKV& EncodeKey(int key, size_t size, bool is_push) { - if (compress_ != "none") { - return EncodeCompressedKey(key, size, is_push); + inline PSKV& EncodeDefaultKey(int key, size_t size, bool is_push) { + mu_.lock(); + PSKV& pskv = ps_kv_[key]; + mu_.unlock(); + if (!pskv.keys.empty()) { + CHECK_EQ(static_cast(pskv.size), size) << "The value size cannot be changed"; } else { - return EncodeDefaultKey(key, size, is_push); + auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); + int num_servers = krs.size(); + CHECK_GT(num_servers, 0); + + // a simple heuristic for load balance + if (size < bigarray_bound_) { + // send it to a single random picked server + int server = (key * 9973) % num_servers; + ps::Key ps_key = krs[server].begin() + key; + CHECK_LT(ps_key, krs[server].end()); + pskv.keys.push_back(ps_key); + pskv.lens.push_back(size); + pskv.size = size; + } else { + // parition it to all servers + pskv.size = 0; + for (int i = 0; i < num_servers; ++i) { + size_t part_size = + static_cast(round(static_cast(size)/num_servers*(i+1))) - + static_cast(round(static_cast(size)/num_servers*i)); + ps::Key ps_key = krs[i].begin() + key; + CHECK_LT(ps_key, krs[i].end()); + pskv.keys.push_back(ps_key); + pskv.lens.push_back(part_size); + pskv.size += part_size; + } + CHECK_EQ(static_cast(pskv.size), size); + } } + return pskv; } /** * \brief Convert to keys in ps for compressed values * Divides original array into equal parts for each server - * with 3 floats space for meta 
info + * Populates both push and pull pskv on first call */ inline PSKV& EncodeCompressedKey(int key, size_t original_size, bool is_push) { auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); int num_servers = krs.size(); CHECK_GT(num_servers, 0); - int bits; - if (compress_ == "2bit") { - bits = 16; - } else { - LOG(FATAL) << "Unsupported compression type"; - } // represents size of data to be sent - size_t compr_size = original_size % bits == 0 ? - original_size / bits: original_size / bits + 1; + size_t compr_size = gc_->GetCompressedSize(original_size); mu_.lock(); - PSKV& pskv = (is_push) ? push_ps_kv_[key] : pull_ps_kv_[key]; + PSKV& pskv = (is_push) ? compr_ps_kv_[key].push : compr_ps_kv_[key].pull; mu_.unlock(); if (!pskv.keys.empty()) { @@ -587,8 +569,8 @@ class KVStoreDist : public KVStoreLocal { // push pskv has sizes corresponding to compressed data // pull pskv has decompressed sizes for parts in push_pskv mu_.lock(); - PSKV& pull_pskv = pull_ps_kv_[key]; - PSKV& push_pskv = push_ps_kv_[key]; + PSKV& pull_pskv = compr_ps_kv_[key].pull; + PSKV& push_pskv = compr_ps_kv_[key].push; mu_.unlock(); if (original_size < bigarray_bound_) { @@ -620,9 +602,8 @@ class KVStoreDist : public KVStoreLocal { } else { part_compr = static_cast (round(static_cast(compr_size)/num_servers*(i+1))) - static_cast (round(static_cast(compr_size)/num_servers*(i))); - part_orig = part_compr * bits; + part_orig = part_compr * gc_->GetCompressionFactor(); } - // TODO(huilgolr) specific to 2bit compression. generalize // meta info ps::Key ps_key_dummy = krs[i].begin() + part_orig; @@ -650,48 +631,6 @@ class KVStoreDist : public KVStoreLocal { return pskv; } - /** - * \brief convert to keys in ps - */ - inline PSKV& EncodeDefaultKey(int key, size_t size, bool is_push) { - mu_.lock(); - PSKV& pskv = (is_push) ? 
push_ps_kv_[key] : pull_ps_kv_[key]; - mu_.unlock(); - if (!pskv.keys.empty()) { - CHECK_EQ(static_cast(pskv.size), size) << "The value size cannot be changed"; - } else { - auto krs = ps::Postoffice::Get()->GetServerKeyRanges(); - int num_servers = krs.size(); - CHECK_GT(num_servers, 0); - - // a simple heuristic for load balance - if (size < bigarray_bound_) { - // send it to a single random picked server - int server = (key * 9973) % num_servers; - ps::Key ps_key = krs[server].begin() + key; - CHECK_LT(ps_key, krs[server].end()); - pskv.keys.push_back(ps_key); - pskv.lens.push_back(size); - pskv.size = size; - } else { - // parition it to all servers - pskv.size = 0; - for (int i = 0; i < num_servers; ++i) { - size_t part_size = - static_cast(round(static_cast(size)/num_servers*(i+1))) - - static_cast(round(static_cast(size)/num_servers*i)); - ps::Key ps_key = krs[i].begin() + key; - CHECK_LT(ps_key, krs[i].end()); - pskv.keys.push_back(ps_key); - pskv.lens.push_back(part_size); - pskv.size += part_size; - } - CHECK_EQ(static_cast(pskv.size), size); - } - } - return pskv; - } - // Note: this encoding method for row sparse keys doesn't allow cross-layer batching inline PSKV& EncodeRowSparseKey(const int key, const int64_t size, const int64_t num_rows, const int64_t *offsets, const size_t unit_len, @@ -783,12 +722,12 @@ class KVStoreDist : public KVStoreLocal { /** * \brief residual buffer to accumulate quantization error + * during gradient compression */ std::unordered_map residual_; bool log_verbose_; -// std::unordered_map count_save; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 6253d55dbe27..908109798fd7 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -45,7 +45,7 @@ static const int kDefaultPushPull = 0; static const int kCompressedPushPull = 3; static const int kStopServer = -1; static const int kSyncMode = -2; -static const int kSetCompress = 2; 
+static const int kSetGradientCompression = 2; /** * \brief executor runs a function using the thread called \ref Start @@ -119,11 +119,13 @@ class KVStoreDistServer { ps_server_->set_request_handle( std::bind(&KVStoreDistServer::DataHandleEx, this, _1, _2, _3)); sync_mode_ = false; + gc_ = new Gc(); log_verbose_ = dmlc::GetEnv("MXNET_KVSTORE_DIST_ROW_SPARSE_VERBOSE", false); } ~KVStoreDistServer() { delete ps_server_; + delete gc_; } void set_controller(const KVStore::Controller& controller) { @@ -154,8 +156,8 @@ class KVStoreDistServer { exec_.Stop(); } else if (recved.head == kSyncMode) { sync_mode_ = true; - } else if (recved.head == kSetCompress) { - compress_ = recved.body; + } else if (recved.head == kSetGradientCompression) { + gc_->DecodeParams(recved.body); } else { // let the main thread to execute ctrl, which is necessary for python exec_.Exec([this, recved]() { @@ -365,15 +367,31 @@ class KVStoreDistServer { } } + void DefaultStorageResponse(int key, NDArray& stored, + const ps::KVMeta& req_meta, + const ps::KVPairs &req_data, + ps::KVServer* server) { + ps::KVPairs response; + CHECK(!stored.is_none()) << "init " << key << " first"; + auto len = stored.shape().Size(); + response.keys = req_data.keys; + response.lens = {len}; + // TODO(mli) try to remove this CopyFrom + response.vals.CopyFrom(static_cast(stored.data().dptr_), len); + server->Response(req_meta, response); + } + void DataHandleCompressed(const ps::KVMeta& req_meta, const ps::KVPairs &req_data, ps::KVServer* server) { CHECK_EQ(req_meta.cmd, kCompressedPushPull); - // do some check if (req_meta.push) { // there used several WaitToRead, this is because \a recved's memory // could be deallocated when this function returns. 
so we need to make sure // the operators with \a NDArray are actually finished + + // first for dummy key which represents original size of array, whose len is 0 + CHECK_EQ(req_data.keys.size(), (size_t)2); CHECK_EQ(req_data.lens.size(), (size_t)2); CHECK_EQ(req_data.vals.size(), (size_t)req_data.lens[1]); @@ -381,47 +399,24 @@ class KVStoreDistServer { int key = DecodeKey(req_data.keys[1]); auto& stored = store_[key]; -// if (count_save.count(key)==0) { -// count_save[key] = 0; -// } else { -// count_save[key]++; -// } - size_t ds[] = {(size_t)req_data.lens[1]}; TShape dshape(ds, ds + 1); - TBlob recv_blob((real_t*)req_data.vals.data(), // NOLINT(*) + TBlob recv_blob((real_t*) req_data.vals.data(), // NOLINT(*) dshape, cpu::kDevMask); NDArray recved = NDArray(recv_blob, 0); NDArray decomp_buf = decomp_buf_[key]; dshape = TShape{(int64_t) original_size}; - // TODO(huilgolr) check and merge with init of stored if (decomp_buf.is_none()) { decomp_buf = NDArray(dshape, Context()); } if (stored.is_none()) { - // initialization stored = NDArray(dshape, Context()); -// { -// std::unique_ptr fo( -// dmlc::Stream::Create((compress_ + "server_recved_count" + std::to_string(count_save[key]) + "_" + std::to_string(key)).c_str(), "w")); -// recved.WaitToRead(); -// mxnet::NDArray::Save(fo.get(), {recved}, {}); -// } - - Dequantize(recved, &stored, neg_threshold, pos_threshold, compress_, 0); - + gc_->Dequantize(recved, &stored, 0); server->Response(req_meta); stored.WaitToRead(); - -// { -// std::unique_ptr fo( -// dmlc::Stream::Create((compress_ + "server_stored_count" + std::to_string(count_save[key]) + "_" + std::to_string(key)).c_str(), "w")); -// stored.WaitToRead(); -// mxnet::NDArray::Save(fo.get(), {stored}, {}); -// } } else if (sync_mode_) { // synced push auto& merged = merge_buf_[key]; @@ -429,16 +424,16 @@ class KVStoreDistServer { merged.array = NDArray(dshape, Context()); } if (merged.request.size() == 0) { - Dequantize(recved, &merged.array, neg_threshold, 
pos_threshold, compress_, 0); + gc_->Dequantize(recved, &merged.array, 0); } else { - Dequantize(recved, &decomp_buf, neg_threshold, pos_threshold, compress_, 0); + gc_->Dequantize(recved, &decomp_buf, 0); merged.array += decomp_buf; } merged.request.push_back(req_meta); ApplyUpdates(key, &merged, &stored, server); } else { // async push - Dequantize(recved, &decomp_buf, neg_threshold, pos_threshold, compress_, 0); + gc_->Dequantize(recved, &decomp_buf, 0); exec_.Exec([this, key, &decomp_buf, &stored]() { CHECK(updater_); updater_(key, decomp_buf, &stored); @@ -447,8 +442,10 @@ class KVStoreDistServer { stored.WaitToRead(); } } else { // pull - // never used - DataHandleDefault(req_meta, req_data, server); + CHECK_EQ(req_data.keys.size(), (size_t)1); + CHECK_EQ(req_data.lens.size(), (size_t)0); + int key = DecodeKey(req_data.keys[0]); + DefaultStorageResponse(key, store_[key], req_meta, req_data, server); } } @@ -504,15 +501,7 @@ class KVStoreDistServer { stored.WaitToRead(); } } else { - // pull - ps::KVPairs response; - CHECK(!stored.is_none()) << "init " << key << " first"; - auto len = stored.shape().Size(); - response.keys = req_data.keys; - response.lens = {len}; - // TODO(mli) try to remove this CopyFrom - response.vals.CopyFrom(static_cast(stored.data().dptr_), len); - server->Response(req_meta, response); + DefaultStorageResponse(key, stored, req_meta, req_data, server); } } @@ -554,15 +543,11 @@ class KVStoreDistServer { bool log_verbose_; /** - * \brief compress_ refers to whether values sent to kvstore server are - * in quantized form. It can be 'none' or '2bit' for now. This is set - * by worker by sending a command to server + * \brief gradient compression object. 
+ * starts with none, used after SetGradientCompression sets the type + * currently there is no support for unsetting gradient compression */ - std::string compress_ = "none"; - - float pos_threshold = 0.5; - float neg_threshold = -0.5; -// std::unordered_map count_save; + Gc* gc_; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index c177cb2eecc5..756a6f243e8b 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -58,10 +58,12 @@ class KVStoreLocal : public KVStore { comm_ = new CommCPU(); } pinned_ctx_ = comm_->pinned_ctx(); + gc_ = new Gc(); } virtual ~KVStoreLocal() { delete comm_; + delete gc_; } void Init(const std::vector& keys, @@ -135,14 +137,11 @@ class KVStoreLocal : public KVStore { PullRowSparseImpl(keys, val_rowids, priority); } - void SetCompress(const std::string& compress, const float neg_threshold, - const float pos_threshold) override { - compress_ = compress; - pos_threshold_ = pos_threshold; - neg_threshold_ = neg_threshold; + void SetGradientCompression(const std::string& compression_type, const float threshold) override { + gc_->SetParams(compression_type, threshold); } - private: +private: virtual void InitImpl(const std::vector& keys, const std::vector& values) { for (size_t i = 0; i < keys.size(); ++i) { @@ -151,7 +150,9 @@ class KVStoreLocal : public KVStore { local_[keys[i]] = values[i].Copy(pinned_ctx_); comm_->Init(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } - comm_->SetCompress(compress_, neg_threshold_, pos_threshold_); + //TODO verify if comm destruction doesn't cause double free memory corruption + comm_->SetGradientCompression(gc_); + gc_->set_active(); } virtual void PushImpl(const std::vector& keys, @@ -387,6 +388,7 @@ class KVStoreLocal : public KVStore { std::unordered_set warnings_printed_; /// whether int or string is used for keys KeyType key_type_ = kUndefinedKey; + }; } // namespace kvstore } // namespace mxnet diff 
--git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 16d4bc639820..fb2275b7077f 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -30,7 +30,6 @@ #include #include #include -#include "./ndarray_function.h" #include "../common/utils.h" #include "../operator/tensor/matrix_op-inl.h" #include "../operator/tensor/init_op.h" @@ -558,89 +557,6 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) { } } -void Quantize(const NDArray &from, NDArray *to, NDArray *residual, - const std::string& compress, const float neg_threshold, const float pos_threshold, - int priority) { - CHECK(from.shape().ndim() != 0) - << "source operands have zero dimension shape"; - // important: callback must always capture by value - int a = from.ctx().dev_mask(); - int b = to->ctx().dev_mask(); - if (a == cpu::kDevMask && b == cpu::kDevMask) { - if (compress == "2bit") { - Engine::Get()->PushSync([from, to, residual, neg_threshold, pos_threshold](RunContext ctx) { - std::vector inputs = {from.data(), residual->data(), to->data()}; - mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, - neg_threshold, pos_threshold); - }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); - } else { - LOG(FATAL) << "Unsupported Quantization"; - } - } else { -#if MXNET_USE_CUDA - if (a == gpu::kDevMask && b == gpu::kDevMask) { - if (compress == "2bit") { - Engine::Get()->PushSync([from, to, residual, neg_threshold, pos_threshold](RunContext ctx) { - std::vector inputs = {from.data(), residual->data(), to->data()}; - mxnet::ndarray::Quantize2BitDispatch(ctx.get_stream(), inputs, - neg_threshold, pos_threshold); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); - } else { - LOG(FATAL) << "Unsupported Quantization"; - } - } else { - LOG(FATAL) << "unknown 
device mask"; - } -#else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; -#endif - } -} - -void Dequantize(const NDArray &from, NDArray *to, - const float neg_threshold, const float pos_threshold, const std::string& compress, int priority) { - CHECK(from.shape().ndim() != 0) - << "source operands have zero dimension shape"; - int a = from.ctx().dev_mask(); - int b = to->ctx().dev_mask(); - if (a == cpu::kDevMask && b == cpu::kDevMask) { - if (compress == "2bit") { - Engine::Get()->PushSync([from, to, neg_threshold, pos_threshold](RunContext ctx) { - std::vector inputs = {from.data(), to->data()}; - mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs, neg_threshold, pos_threshold); - }, from.ctx(), {from.var()}, {to->var()}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); - } else { - LOG(FATAL) << "Unsupported dequantization " << compress << std::endl; - } - } else { -#if MXNET_USE_CUDA - if (a == gpu::kDevMask && b == gpu::kDevMask) { - if (compress == "2bit") { - Engine::Get()->PushSync([from, to, neg_threshold, pos_threshold](RunContext ctx) { - std::vector inputs = {from.data(), to->data()}; - mxnet::ndarray::Dequantize2BitDispatch(ctx.get_stream(), inputs, neg_threshold, pos_threshold); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, from.ctx(), {from.var()}, {to->var()}, - FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); - } else { - LOG(FATAL) << "Unsupported dequantization " << compress << std::endl; - } - } else { - LOG(FATAL) << "unknown device mask"; - } -#else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; -#endif - } -} - - void CopyFromTo(const NDArray& from, NDArray *to, int priority) { CopyFromTo(from, *to, priority); } diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index 6da0f58a7259..810cf0e793c1 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -27,7 +27,6 @@ #include "./ndarray_function.h" #include 
"./ndarray_function-inl.h" #include "../common/utils.h" -#include "../operator/contrib/two_bit_quantize-inl.h" namespace mxnet { namespace ndarray { @@ -185,23 +184,5 @@ void ElementwiseSum(mshadow::Stream* s, } } -/* - * \brief Enables use of function defined under Dequantize2Bit operator for an ndarray - */ -template<> -void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, - const float neg_threshold, const float pos_threshold) { - mxnet::op::Dequantize2BitImpl(s, inputs, neg_threshold, pos_threshold); -} - -/* - * \brief Enables use of function defined under Quantize2Bit operator for an ndarray - */ -template<> -void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, - const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImpl(s, inputs, neg_threshold, pos_threshold); -} - } // namespace ndarray } // namespace mxnet diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu index d5b519a836d9..8accc2b41cfd 100644 --- a/src/ndarray/ndarray_function.cu +++ b/src/ndarray/ndarray_function.cu @@ -32,7 +32,6 @@ #include "./ndarray_function.h" #include "./ndarray_function-inl.h" #include "./ndarray_function-inl.cuh" -#include "../operator/contrib/two_bit_quantize-inl.h" namespace mxnet { namespace ndarray { @@ -203,23 +202,5 @@ void ElementwiseSum(mshadow::Stream* s, } } -/* - * \brief Enables use of function defined under Dequantize2Bit operator for an ndarray - */ -template<> -void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, - const float neg_threshold, const float pos_threshold) { - mxnet::op::Dequantize2BitImpl(s, inputs, neg_threshold, pos_threshold); -} - -/* - * \brief Enables use of function defined under Quantize2Bit operator for an ndarray - */ -template<> -void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, - const float neg_threshold, const float pos_threshold) { - mxnet::op::Quantize2BitImpl(s, inputs, neg_threshold, 
pos_threshold); -} - } // namespace ndarray } // namespace mxnet diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index be6eae81cd4a..404684a5878f 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -23,13 +23,12 @@ */ #ifndef MXNET_NDARRAY_NDARRAY_FUNCTION_H_ #define MXNET_NDARRAY_NDARRAY_FUNCTION_H_ - +#include #include #include #include #include #include -#include #include "../operator/mshadow_op.h" namespace mxnet { @@ -164,20 +163,6 @@ void Copy(const TBlob &from, TBlob *to, Context from_ctx, Context to_ctx, RunContext ctx); -/* - * \brief Enables use of function defined under Dequantize2Bit operator for an ndarray - */ -template -void Dequantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, - const float neg_threshold, const float pos_threshold); - -/* - * \brief Enables use of function defined under Quantize2Bit operator for an ndarray - */ -template -void Quantize2BitDispatch(mshadow::Stream* s, const std::vector& inputs, - const float neg_threshold, const float pos_threshold); - template void ElementwiseSum(const std::vector source, TBlob *out, diff --git a/src/operator/contrib/two.rahul b/src/operator/contrib/two.rahul deleted file mode 100644 index 42640fd30af6..000000000000 --- a/src/operator/contrib/two.rahul +++ /dev/null @@ -1,340 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - /*! - * \file two_bit_quantize-inl.h - * \brief implementation of quantize_2bit operation - */ -#ifndef MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ -#define MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ - -#include -#include -#include -#include "../elemwise_op_common.h" -#include "../mshadow_op.h" -#include "../mxnet_op.h" - -namespace mxnet { -namespace op { - -struct init_mem_2bit { - // Initialize output array - MSHADOW_XINLINE static void Map(int i, float* out) { - *(out+i) = 0; - } -}; - -struct TwoBitParam : public dmlc::Parameter { - float pos_threshold, neg_threshold; - DMLC_DECLARE_PARAMETER(TwoBitParam) { - DMLC_DECLARE_FIELD(neg_threshold) - .set_default(-0.1) - .describe("Threshold to quantize negative values. " - "Has to be less than 0"); - DMLC_DECLARE_FIELD(pos_threshold) - .set_default(0.1) - .describe("Threshold to quantize positive values. " - "Has to be greater than 0"); - } -}; - -template -void Create2BitArrayCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - // For now, this method can only compress the float data - mshadow::Stream *s = ctx.get_stream(); - // Init the memory of output to 0x00000000 - mxnet_op::Kernel::Launch(s, outputs[0].Size(), - outputs[0].dptr()); // compressed array -} - -inline bool Create2BitArrayShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - // 0. input array - CHECK_EQ(in_attrs->size(), 1U); - // 0. 
output array - CHECK_EQ(out_attrs->size(), 1U); - // check input - CHECK(!shape_is_none(in_attrs->at(0))); - // output - int shape = in_attrs->at(0).Size() % 16 == 0 ? - in_attrs->at(0).Size() / 16 + 3: - in_attrs->at(0).Size() / 16 + 4; - SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape{shape}); - return true; -} - -inline bool Create2BitArrayType(const nnvm::NodeAttrs &attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - // 0. input array - CHECK_EQ(in_attrs->size(), 1U); - // 0. output array - CHECK_EQ(out_attrs->size(), 1U); - // check input - CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) - << "`create_2bit_` only supports float32 input for now"; - TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32); - return true; -} - -struct init_threshold_2bit { - MSHADOW_XINLINE static void Map(int i, - float *out, - const float neg_threshold, - const float pos_threshold, - int size) { - // The first two elements in output are thresholds - // The third element is the original size of the array - out[0] = neg_threshold; - out[1] = pos_threshold; - // TODO(huilgolr) check potential problem here? - out[2] = static_cast(size); - } -}; - -struct quantize_2bit { - MSHADOW_XINLINE static void Map(int block_id, - int grad_size, - float *out, - float *grad, - float *residual, - const float neg_threshold, - const float pos_threshold) { - float* compr_block = out + block_id; - // init to 0 - *compr_block = 0; - // start and end are indices in original grad array - int start = block_id*16; - int end = (start+16 <= grad_size) ? 
start+16 : grad_size; - char* block_ptr = reinterpret_cast < char* > (compr_block); - for (int i=start; i < end; i++){ - char* curr_byte = block_ptr + (i-start)/4; - float curr_value = grad[i] + residual[i]; - if (curr_value >= pos_threshold) { - residual[i] = curr_value - pos_threshold; - // set data to 10 - (*curr_byte) |= (2u << (6-((i%4)*2))); - } else if (curr_value <= neg_threshold) { - residual[i] = curr_value - neg_threshold; - // set data to 01 - (*curr_byte) |= (1u << (6-((i%4)*2))); - } else { - // leave data as 00 - residual[i] = curr_value; - } - } - } -}; - -template -void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float neg_threshold, const float pos_threshold) { - // Init threshold and original size - mxnet_op::Kernel::Launch(s, 1, - inputs[2].dptr(), // compressed array - neg_threshold, pos_threshold, - inputs[0].Size()); - // Finally, compress the data and calculate new residual - mxnet_op::Kernel::Launch(s, inputs[2].Size()-3, - inputs[0].Size(), // original grad size - inputs[2].dptr()+3, // compressed array - inputs[0].dptr(), // input array - inputs[1].dptr(), // residual array - neg_threshold, // negative threshold - pos_threshold); // positive threshold -} - -// this function has been defined as quantize_2bit operator -template -void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - mshadow::Stream *s = ctx.get_stream(); - const TwoBitParam& param = nnvm::get(attrs.parsed); - Quantize2BitImpl(s, inputs, param.neg_threshold, param.pos_threshold); -} - -inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - // 0. input array - // 1. residual array - // 2. 
compressed array - CHECK_EQ(in_attrs->size(), 3U); - CHECK(!shape_is_none(in_attrs->at(0))); - CHECK(!shape_is_none(in_attrs->at(1))); - CHECK_EQ(in_attrs->at(0).Size(), - in_attrs->at(1).Size()); - int shape = in_attrs->at(0).Size() % 16 == 0 ? - in_attrs->at(0).Size() / 16 + 3: - in_attrs->at(0).Size() / 16 + 4; - CHECK_EQ(in_attrs->at(2).Size(), shape) - << "The size of output array is not equal to " - << "the size of compressed array"; - return true; -} - -inline bool Quantize2BitType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - // 0. input array - // 1. residual array - // 2. compressed array - CHECK_EQ(in_attrs->size(), 3U); - // check input - CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) - << "`quantize_2bit_` only supports float32 input for now"; - CHECK_EQ((*in_attrs)[1], mshadow::kFloat32) - << "`quantize_2bit_` only supports float32 input for now"; - CHECK_EQ((*in_attrs)[2], mshadow::kFloat32) - << "`quantize_2bit_` only supports float32 input for now"; - return true; -} - -struct dequantize_2bit { - // Decompress - MSHADOW_XINLINE static void Map(int i, - float *out, - float *in, - float *neg_threshold, - float *pos_threshold) { - // get block ptr - int block_id = i / 16; - char* ch_ptr = reinterpret_cast(in+block_id); - // get row ptr - int row_id = (i%16)/4; - ch_ptr += row_id; - // get column id - int col_id = (i%16)%4; - // Decompress - switch (col_id) { - case 0: - // positve - if (((*ch_ptr) & (0xc0)) == 0x80) { // binary: (10)00 0000 - out[i] = *pos_threshold; - // negative - } else if (((*ch_ptr) & (0xc0)) == 0x40) { // binary: (01)00 0000 - out[i] = *neg_threshold; - } else { // 0 - out[i] = 0; - } - break; - case 1: - // positve - if (((*ch_ptr) & (0x30)) == 0x20) { // binary: 00(10) 0000 - out[i] = *pos_threshold; - // negative - } else if (((*ch_ptr) & (0x30)) == 0x10) { // binary: 00(01) 0000 - out[i] = *neg_threshold; - } else { // 0 - out[i] = 0; - } - break; - case 2: - // positve - if (((*ch_ptr) & 
(0x0c)) == 0x08) { // binary: 00(10) 0000 - out[i] = *pos_threshold; - // negative - } else if (((*ch_ptr) & (0x0c)) == 0x04) { // binary: 00(01) 0000 - out[i] = *neg_threshold; - } else { // 0 - out[i] = 0; - } - break; - case 3: - // positve - if (((*ch_ptr) & (0x03)) == 0x02) { // binary: 00(10) 0000 - out[i] = *pos_threshold; - // negative - } else if (((*ch_ptr) & (0x03)) == 0x01) { // binary: 00(01) 0000 - out[i] = *neg_threshold; - } else { // 0 - out[i] = 0; - } - break; - default: - break; - } - } -}; - -template -void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs) { - // Can only decompress the float32 data - mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size - inputs[1].dptr(), // out array - inputs[0].dptr()+3, // compressed array - inputs[0].dptr(), // negative threshold - inputs[0].dptr()+1); // positive threshold -} - -template -void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - mshadow::Stream *s = ctx.get_stream(); - Dequantize2BitImpl(s, inputs); -} - -inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - // 0. compressed array - // 1. original array - CHECK_EQ(in_attrs->size(), 2U); - // No output - CHECK_EQ(out_attrs->size(), 0U); - // check input - CHECK(!shape_is_none(in_attrs->at(0))); - CHECK(!shape_is_none(in_attrs->at(1))); - // TODO(huilgolr) check - CHECK_LE(in_attrs->at(1).Size(), - in_attrs->at(0).Size()*16) - << "The shape of the second input array are " - << "not equal to the original array."; - return true; -} - -inline bool Dequantize2BitType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - // 0. compressed array - // 1. 
original array - CHECK_EQ(in_attrs->size(), 2U); - // check input - CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) - << "`dequantize_2bit_` only supports float32 input for now"; - CHECK_EQ((*in_attrs)[1], mshadow::kFloat32) - << "`dequantize_2bit_` only supports float32 input for now"; - return true; -} - -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 48e345a63848..ce9e7b9ce943 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -17,23 +17,23 @@ * under the License. */ - /*! - * \file two_bit_quantize-inl.h - * \brief implementation of quantize_2bit operation - */ +/*! + * \file two_bit_quantize-inl.h + * \brief implementation of quantize_2bit operation + * \author Chao Ma, Rahul Huilgol + */ + #ifndef MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ #define MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ #include -#include #include #include -#include "../elemwise_op_common.h" -#include "../mshadow_op.h" -#include "../mxnet_op.h" #include +#include #include -#include "ps/ps.h" -#include +#include +#include +#include namespace mxnet { namespace op { @@ -46,16 +46,12 @@ struct init_mem_2bit { }; struct TwoBitParam : public dmlc::Parameter { - float pos_threshold, neg_threshold; + float threshold; DMLC_DECLARE_PARAMETER(TwoBitParam) { - DMLC_DECLARE_FIELD(neg_threshold) - .set_default(-0.5) - .describe("Threshold to quantize negative values. " - "Has to be less than 0"); - DMLC_DECLARE_FIELD(pos_threshold) + DMLC_DECLARE_FIELD(threshold) .set_default(0.5) - .describe("Threshold to quantize positive values. " - "Has to be greater than 0"); + .describe("Threshold to quantize values. 
" + "Must be greater than 0"); } }; @@ -145,15 +141,14 @@ struct quantize_2bit { template void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float neg_threshold, const float pos_threshold) { - - mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array - inputs[0].Size(), + const float threshold) { + mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array size + inputs[0].Size(), // original size inputs[2].dptr(), // compressed array - inputs[0].dptr(), // input array + inputs[0].dptr(), // original array inputs[1].dptr(), // residual array - neg_threshold, // negative threshold - pos_threshold); // positive threshold + -1 * threshold, // negative threshold + threshold); // positive threshold } // this function has been defined as quantize_2bit operator @@ -165,7 +160,7 @@ void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); const TwoBitParam& param = nnvm::get(attrs.parsed); - Quantize2BitImpl(s, inputs, param.neg_threshold, param.pos_threshold); + Quantize2BitImpl(s, inputs, param.threshold); } inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, @@ -216,7 +211,6 @@ struct dequantize_2bit { float* outval = out + i; char* ch_ptr = reinterpret_cast(in + (i>>4)); -// std::cout<(*ch_ptr)<<" " <(*(ch_ptr+1))<<" "<(*(ch_ptr+2))<<" "<(*(ch_ptr+3))<> 2 ); const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; @@ -226,14 +220,11 @@ struct dequantize_2bit { uint8_t masked = *ch_ptr & mask; if ( masked == mask ) { *outval = pos_threshold; -// std::cout<(*ch_ptr)<< " "<(masked)<< " "<(*ch_ptr)<< " "<(masked)<< " "<(*ch_ptr)<< " "<(masked)<< " 0"< void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float neg_threshold, const float pos_threshold) { + const float threshold) { mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size inputs[1].dptr(), // out array 
inputs[0].dptr(), // compressed array - neg_threshold, // negative threshold - pos_threshold); // positive threshold + -1*threshold, // negative threshold + threshold); // positive threshold } template @@ -257,7 +248,7 @@ void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); const TwoBitParam& param = nnvm::get(attrs.parsed); - Dequantize2BitImpl(s, inputs, param.neg_threshold, param.pos_threshold); + Dequantize2BitImpl(s, inputs, param.threshold); } inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index b2086d88cf98..8f215c6f340c 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -22,6 +22,7 @@ * \brief registers quantize_2bit, dequantize_2bit * and create_2bit operators with nnvm */ +#include #include "./two_bit_quantize-inl.h" namespace mxnet { @@ -33,14 +34,12 @@ NNVM_REGISTER_OP(_contrib_quantize_2bit) .describe(R"code(Quantize an input tensor into using 2bits for each value using user-specified thresholds, while storing quantization error in residual array. -The quantize_2bit operator takes 5 arguments and is called as follows: -`quantize_2bit(array, residual, out, neg_threshold, pos_threshold)`. +The quantize_2bit operator takes 4 arguments and is called as follows: +`quantize_2bit(array, residual, out, threshold)`. The operator modifies `residual` and `out` arrays. The `out`variable will be the quantized array. Note that, `out` array can be generated by invoking `create_2bit(array)`, avoiding calculation of size of quantized array. -This `out` array has first three elements as negative threshold, positive threshold, -and size of the original uncompressed array. Any elements after these three elements -represent quantized data. +This `out` array represents quantized data. 
The operation sums up array and residual, and then applies the thresholds to quantize the data into one of three states represented by 2bits. 16 such quantized floats in the original array @@ -48,10 +47,10 @@ are packed together into one float in the `out` array. The quantization error is stored in residual array. For example, assume the input array (gradient) is [5.0, -1.0, -5.0, -4.0], and the -residual is [0.0, -2.0, 0, 1.0]. Let the negative and positive thresholds be --4.0 and +4.0, respectively. In this method, the elements whose -(gradient + residual) >= pos_threshold will be quantized into 2-bits '01', -and the elements whose (gradient + residual) <= neg_threshold will be +residual is [0.0, -2.0, 0, 1.0]. Let the threshold be +4.0. +In this method, the elements whose (gradient + residual) >= threshold +will be quantized into 2-bits '01', and the elements +whose (gradient + residual) <= -1*threshold will be quantized into 2-bits '10'. The other elements will be quantized as '00'. Every 16 floats in the original array will be packed into one float variable in the output array. @@ -92,11 +91,11 @@ two bit quantization. This array will be on the same context as input array. NNVM_REGISTER_OP(_contrib_dequantize_2bit) .describe(R"code(Dequantize an input tensor quantized by quantize_2bit. -The dequantize_2bit operator takes two input arguments. The first input is a NDArray, +The dequantize_2bit operator takes three input arguments. The first input is a NDArray, which has been generated by quantize_2bit(). The second input is a NDArray that has the same shape as the original array before quantizing. The operator replaces the contents of this array -with dequantized data. +with dequantized data. Third argument is threshold used to quantize the array. 
)code" ADD_FILELINE) .set_num_inputs(2) .set_num_outputs(0) diff --git a/tests/cpp/operator/quantize_perf.cc b/tests/cpp/operator/quantize_perf.cc index d6de53cee5bf..689f9f5d1686 100644 --- a/tests/cpp/operator/quantize_perf.cc +++ b/tests/cpp/operator/quantize_perf.cc @@ -95,7 +95,7 @@ static void RunCoreOpTimingTest(const bool isGPU, * \brief ActivationOp timing test for CPU */ TEST(OMP_TUNING, TimingCPU) { - RunCoreOpTimingTest(false, {}, "quantize_2bt", COREOP_BWD_OP_NAME_VALUE_NONE); + RunCoreOpTimingTest(false, {}, "quantize_2bit", COREOP_BWD_OP_NAME_VALUE_NONE); } #if MXNET_USE_CUDA == 1 diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index a374db319bf4..18c9e1b33fca 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -59,14 +59,13 @@ def init_kv(): return kv, my_rank, nworker def init_kv_compressed(kv): - pos_threshold = 0.5 - neg_threshold = -0.5 - kv.set_compress({'compress': '2bit', 'pos_threshold': pos_threshold, 'neg_threshold': neg_threshold}) + threshold = 0.5 + kv.set_gradient_compression({'compression': '2bit', 'threshold':threshold}) # init kv compression keys kv.init('11221', mx.nd.zeros(big_shape)) kv.init('112221', mx.nd.zeros(irregular_shape)) kv.init('1121', mx.nd.zeros(shape)) - return kv, pos_threshold, neg_threshold + return kv, threshold def test_sync_push_pull(): kv, my_rank, nworker = init_kv() @@ -184,7 +183,7 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): expected[row] = updated_val[row] check_diff_to_scalar(val, expected, rank=my_rank) - def check_compr_residual(kv, pos_threshold, nworker): + def check_compr_residual(kv, threshold, nworker): for k,s in [('1121', shape),('112221',irregular_shape),('11221', big_shape)]: # doesn't meet threshold kv.push(k, mx.nd.ones(s)*0.4) @@ -193,10 +192,10 @@ def check_compr_residual(kv, pos_threshold, nworker): check_diff_to_scalar(val, 0) # just meets threshold with residual - kv.push(k, 
mx.nd.ones(s)*(pos_threshold - 0.4)) + kv.push(k, mx.nd.ones(s)*(threshold - 0.4)) val2 = mx.nd.zeros(s) kv.pull(k,val2) - curval = pos_threshold * rate * nworker + curval = threshold * rate * nworker check_diff_to_scalar(val2, curval) # doesn't meet threshold @@ -206,22 +205,22 @@ def check_compr_residual(kv, pos_threshold, nworker): check_diff_to_scalar(val3, curval) # exceeds again - kv.push(k, mx.nd.ones(s)*(pos_threshold-0.2)) + kv.push(k, mx.nd.ones(s)*(threshold-0.2)) val4 = mx.nd.zeros(s) kv.pull(k,val4) - curval += pos_threshold*rate*nworker + curval += threshold*rate*nworker check_diff_to_scalar(val4, curval) # residual is 0 now - def check_compr_ones(kv, pos, nworker): + def check_compr_ones(kv, threshold, nworker): for k,s in [('1121', shape),('112221',irregular_shape),('11221', big_shape)]: val = mx.nd.zeros(s) kv.pull(k, val) curval = val[0][0].asnumpy()[0] - kv.push(k,mx.nd.ones(s)*pos) + kv.push(k,mx.nd.ones(s)*threshold) val2 = mx.nd.zeros(s) kv.pull(k, val2) - newval = curval + rate*nworker*pos + newval = curval + rate*nworker*threshold check_diff_to_scalar(val2, newval) # residual = 0 again @@ -239,7 +238,7 @@ def check_compr_zero(kv): kv.pull(k, val) check_diff_to_scalar(val, 0) - def check_compr_random(kv, pos, neg, nworker): + def check_compr_random(kv, threshold, nworker): # set a seed so all workers generate same data. 
knowing this helps # calculate expected value after pull mx.random.seed(123) @@ -258,9 +257,9 @@ def check_compr_random(kv, pos, neg, nworker): diff = val - orig_val # compute expected by directly using operators compr = mx.contrib.nd.create_2bit(grad_cpy) - mx.contrib.ndarray.quantize_2bit(grad_cpy, mx.nd.zeros(s), compr, neg, pos) + mx.contrib.ndarray.quantize_2bit(grad_cpy, mx.nd.zeros(s), compr, threshold) decompr = mx.nd.zeros(grad.shape) - mx.contrib.ndarray.dequantize_2bit(compr, decompr) + mx.contrib.ndarray.dequantize_2bit(compr, decompr, threshold) decompr *= nworker * rate assert_almost_equal(diff.asnumpy(), decompr.asnumpy()) @@ -273,12 +272,12 @@ def check_compr_random(kv, pos, neg, nworker): print('worker ' + str(my_rank) + ' is done with non compression tests') # don't run non compressed keys after this as kvstore now is set to compressed - kv, pos, neg = init_kv_compressed(kv) + kv, threshold = init_kv_compressed(kv) check_compr_pull_before_push(kv) check_compr_zero(kv) - check_compr_residual(kv, pos, nworker) - check_compr_ones(kv, pos, nworker) - check_compr_random(kv, pos, neg, nworker) + check_compr_residual(kv, threshold, nworker) + check_compr_ones(kv, threshold, nworker) + check_compr_random(kv, threshold, nworker) print('worker ' + str(my_rank) + ' is done with compression tests') def test_sync_init(): diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index de4f7ff66a7a..d9415865d06a 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -60,11 +60,11 @@ def test_kvstore(kv_type): err = sum(err) / np.sum(np.abs(res[j])) assert(err < 1e-6), (err, shapes[j]) -def test_compress_kvstore(kv_type, compress='2bit', neg=-0.5, pos=0.5): - print(kv_type + ' with ' + compress + ' compression') +def test_compress_kvstore(kv_type, compression='2bit', threshold=0.5): + print(kv_type + ' with ' + compression + ' compression') rate = 2 kv = mx.kv.create(kv_type) - kv.set_compress({'compress':compress, 
'neg_threshold':neg, 'pos_threshold':pos}) + kv.set_gradient_compression({'compression':compression, 'threshold':threshold}) kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate)) for k, s in zip(keys, shapes): kv.init(k, mx.nd.zeros(s)) @@ -88,7 +88,7 @@ def push_zeros(kv): for o in out: assert_almost_equal(o.asnumpy(), exp) - def verify_residual(kv, neg_threshold, pos_threshold, rate): + def verify_residual(kv, threshold, rate): for j in range(len(keys)): kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*0.4 for g in range(nworker)]) out = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] @@ -96,10 +96,10 @@ def verify_residual(kv, neg_threshold, pos_threshold, rate): for o in out: check_diff_to_scalar(o, 0) - kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*(pos_threshold-0.3) for g in range(nworker)]) + kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*(threshold-0.3) for g in range(nworker)]) out = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] kv.pull(keys[j],out=out) - curval = pos_threshold * rate * nworker + curval = threshold * rate * nworker for o in out: check_diff_to_scalar(o, curval) @@ -109,10 +109,10 @@ def verify_residual(kv, neg_threshold, pos_threshold, rate): for o in out: check_diff_to_scalar(o, curval) - kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*(pos_threshold-0.3) for g in range(nworker)]) + kv.push(keys[j], [mx.nd.ones(shapes[j], mx.gpu(g))*(threshold-0.3) for g in range(nworker)]) out = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] kv.pull(keys[j],out=out) - curval += pos_threshold*rate*nworker + curval += threshold*rate*nworker for o in out: check_diff_to_scalar(o, curval) # residual would be 0 now @@ -129,7 +129,7 @@ def check_neg(kv, neg, rate, curval): check_diff_to_scalar(o, curval) # residual would be 0 again - def check_compr_random(kv, pos, neg): + def check_compr_random(kv, threshold): for j in range(len(keys)): orig_val = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in 
range(nworker)] kv.pull(keys[j], out=orig_val) @@ -148,8 +148,8 @@ def check_compr_random(kv, pos, neg): for g in range(nworker): comprs.append(mx.contrib.nd.create_2bit(grads[g])) decomprs.append(mx.nd.zeros(grads[g].shape, ctx=mx.gpu(g))) - mx.contrib.ndarray.quantize_2bit(grads[g], mx.nd.zeros(shapes[j], ctx=mx.gpu(g)), comprs[g], neg, pos) - mx.contrib.ndarray.dequantize_2bit(comprs[g], decomprs[g]) + mx.contrib.ndarray.quantize_2bit(grads[g], mx.nd.zeros(shapes[j], ctx=mx.gpu(g)), comprs[g], threshold) + mx.contrib.ndarray.dequantize_2bit(comprs[g], decomprs[g], threshold) sum_dequantized_vals += ((decomprs[g]*rate).asnumpy()) for g in range(nworker): assert_almost_equal(diff[g].asnumpy(), sum_dequantized_vals) @@ -157,9 +157,9 @@ def check_compr_random(kv, pos, neg): pull_before_push(kv) push_zeros(kv) - curval = verify_residual(kv, neg, pos, rate) - check_neg(kv, neg, rate, curval) - check_compr_random(kv, pos, neg) + curval = verify_residual(kv, threshold, rate) + check_neg(kv, -1*threshold, rate, curval) + check_compr_random(kv, threshold) test_kvstore('local_update_cpu') test_kvstore('local_allreduce_cpu') diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index a73497030c33..1cb30c2cc009 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -4310,10 +4310,9 @@ def check(data, idx): assert (mx.nd.scatter_nd(data, idx, shape=(2, 2)).asnumpy() == [[0, 0], [2, 3]]).all() def test_two_bit_quantization(): - neg_threshold = -0.5 - pos_threshold = 0.5 - orig_shape = [(25,),(16,),(1121),(14400),(144000),(144000)] - num_repeat = 10 + threshold = 0.5 + orig_shape = [(16,), (25,),(1121),(144000),(1440000)] + num_repeat = 2 from struct import pack,unpack def bits2int(bits): @@ -4326,7 +4325,7 @@ def bits2int(bits): def as_float32(s): return unpack("f",pack("I", bits2int(s)))[0] - def compute_expected(arr, neg, pos, curr_residual): + def compute_expected(arr, curr_residual, 
threshold): # str_quant stores the quantized representation as a sequence of bits str_quant = '' new_residual = [] @@ -4335,14 +4334,14 @@ def compute_expected(arr, neg, pos, curr_residual): curr_res_npy = curr_residual.asnumpy() for i, a in np.ndenumerate(arr_npy): a += curr_res_npy[i] - if a >= pos: + if a >= threshold: str_quant += '11' - new_residual.append(a - pos) - decompr.append(pos) - elif a <= neg: + new_residual.append(a - threshold) + decompr.append(threshold) + elif a <= (-1*threshold): str_quant += '10' - new_residual.append(a - neg) - decompr.append(neg) + new_residual.append(a + threshold) + decompr.append(-1*threshold) else: str_quant += '00' new_residual.append(a) @@ -4361,11 +4360,11 @@ def compute_expected(arr, neg, pos, curr_residual): return compr, new_residual, decompr def check(grad, residual): - exp_compr, exp_residual, exp_decompr = compute_expected(grad, neg_threshold, pos_threshold, residual) + exp_compr, exp_residual, exp_decompr = compute_expected(grad, residual, threshold) compr = mx.contrib.nd.create_2bit(grad) - mx.contrib.ndarray.quantize_2bit(grad, residual, compr, neg_threshold, pos_threshold) + mx.contrib.ndarray.quantize_2bit(grad, residual, compr, threshold) decompr = mx.nd.zeros(grad.shape) - mx.contrib.ndarray.dequantize_2bit(compr, decompr, neg_threshold, pos_threshold) + mx.contrib.ndarray.dequantize_2bit(compr, decompr, threshold) np.testing.assert_array_equal(compr.asnumpy(), np.array(exp_compr)) , (compr, exp_compr) np.testing.assert_array_equal(decompr.asnumpy(), np.array(exp_decompr)) , (decompr, exp_decompr) # use almost equal for residual as this involves addition operation diff --git a/tools/bandwidth/measure.py b/tools/bandwidth/measure.py index d9c9fbe930a1..472f954b14b8 100644 --- a/tools/bandwidth/measure.py +++ b/tools/bandwidth/measure.py @@ -53,7 +53,8 @@ def parse_args(): help='number of classes') parser.add_argument('--optimizer', type=str, default='None', help='the optimizer set to kvstore. 
None means no optimizer') - parser.add_argument('--compress', type=str, default='none') + parser.add_argument('--gc-type', type=str, default='none', + help='type of gradient compression') args = parser.parse_args() logging.info(args) return args @@ -73,11 +74,11 @@ def error(gpu_res, cpu_res): return res def run(network, optimizer, gpus, kv_store, image_shape, disp_batches, - num_batches, test_results, compress, **kwargs): + num_batches, test_results, gc_type, **kwargs): # create kvstore and optimizer devs = [mx.gpu(int(i)) for i in gpus.split(',')] kv = mx.kv.create(kv_store) - kv.set_compress({'compress':compress,'pos_threshold':0.5, 'neg_threshold':-0.5}) + kv.set_gradient_compression({'compression': gc_type}) if optimizer is None or optimizer == 'None': opt = None else: From c4d9a453aa5412e5a961480c30feae1aca4b3fc9 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 7 Nov 2017 23:13:23 +0000 Subject: [PATCH 183/237] redo header files --- include/mxnet/kvstore.h | 2 +- src/kvstore/gc.h | 11 ++++++----- src/operator/contrib/two_bit_quantize-inl.h | 10 ++++++---- src/operator/contrib/two_bit_quantize.cc | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index 4dd0384ed66b..637324786562 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -30,7 +30,7 @@ #include #include #include -#include +#include "../../src/kvstore/gc.h" #include "./ndarray.h" #if MXNET_USE_DIST_KVSTORE #include "ps/ps.h" diff --git a/src/kvstore/gc.h b/src/kvstore/gc.h index e3a3b28efc0f..7f61b43c0699 100644 --- a/src/kvstore/gc.h +++ b/src/kvstore/gc.h @@ -29,9 +29,10 @@ #include #include -#include -#include -#include +#include"../../include/mxnet/ndarray.h" +#include "../../include/mxnet/base.h" +#include +#include "../operator/contrib/two_bit_quantize-inl.h" // TODO check if it returns empty between two delims template @@ -134,7 +135,7 @@ class Gc { std::vector inputs = {from.data(), residual->data(), 
to->data()}; mxnet::op::Quantize2BitImpl(ctx.get_stream(), inputs, threshold); // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + ctx.get_stream()->Wait(); }, from.ctx(), {from.var()}, {to->var(), residual->var()}, mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); } else { @@ -170,7 +171,7 @@ class Gc { std::vector inputs = {from.data(), to->data()}; mxnet::op::Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + ctx.get_stream()->Wait(); }, from.ctx(), {from.var()}, {to->var()}, mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); } else { diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index ce9e7b9ce943..15062881700c 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -32,8 +32,8 @@ #include #include #include -#include -#include +#include "../operator_common.h" +#include "../mxnet_op.h" namespace mxnet { namespace op { @@ -142,7 +142,9 @@ struct quantize_2bit { template void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { - mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array size + using namespace mshadow; + using namespace mxnet_op; + Kernel::template Launch(s, inputs[2].Size(), // compressed array size inputs[0].Size(), // original size inputs[2].dptr(), // compressed array inputs[0].dptr(), // original array @@ -233,7 +235,7 @@ struct dequantize_2bit { template void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { - mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size + mxnet::op::mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size inputs[1].dptr(), // out array inputs[0].dptr(), // compressed array -1*threshold, // negative threshold diff --git a/src/operator/contrib/two_bit_quantize.cc 
b/src/operator/contrib/two_bit_quantize.cc index 8f215c6f340c..deedab434a51 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -22,7 +22,7 @@ * \brief registers quantize_2bit, dequantize_2bit * and create_2bit operators with nnvm */ -#include +#include "../elemwise_op_common.h" #include "./two_bit_quantize-inl.h" namespace mxnet { From 3a2060b7f203592e2de30baee62945d1f7f8e4c3 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 7 Nov 2017 23:18:46 +0000 Subject: [PATCH 184/237] remove ps --- include/mxnet/ndarray.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 458a3e78d077..84ee9fa5e4d4 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -34,7 +34,6 @@ #include #include #include -#include "ps/ps.h" #include "./base.h" #include "./storage.h" #include "./engine.h" From 193586e5b2c278bba7836e79646326ea4d5fab17 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 7 Nov 2017 23:36:53 +0000 Subject: [PATCH 185/237] remove unused header --- src/operator/contrib/two_bit_quantize-inl.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 15062881700c..e6db0dd5bc2d 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -31,7 +31,6 @@ #include #include #include -#include #include "../operator_common.h" #include "../mxnet_op.h" @@ -143,8 +142,7 @@ template void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { using namespace mshadow; - using namespace mxnet_op; - Kernel::template Launch(s, inputs[2].Size(), // compressed array size + mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array size inputs[0].Size(), // original size inputs[2].dptr(), // compressed array inputs[0].dptr(), // original array From 
75399ffdc0577e3eb452970496f5246810a69636 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 7 Nov 2017 23:59:04 +0000 Subject: [PATCH 186/237] fix compile issues --- src/kvstore/gc.h | 8 ++-- src/operator/contrib/two_bit_quantize-inl.h | 41 ++++++++++++++++----- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/kvstore/gc.h b/src/kvstore/gc.h index 7f61b43c0699..02a6c56ffa0a 100644 --- a/src/kvstore/gc.h +++ b/src/kvstore/gc.h @@ -125,7 +125,7 @@ class Gc { if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { std::vector inputs = {from.data(), residual->data(), to->data()}; - mxnet::op::Quantize2BitImpl(ctx.get_stream(), inputs, threshold); + mxnet::op::Quantize2BitImpl(ctx.get_stream(), inputs, threshold); }, from.ctx(), {from.var()}, {to->var(), residual->var()}, mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); } else { @@ -133,7 +133,7 @@ class Gc { if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { std::vector inputs = {from.data(), residual->data(), to->data()}; - mxnet::op::Quantize2BitImpl(ctx.get_stream(), inputs, threshold); + mxnet::op::Quantize2BitImpl(ctx.get_stream(), inputs, threshold); // Wait GPU kernel to complete ctx.get_stream()->Wait(); }, from.ctx(), {from.var()}, {to->var(), residual->var()}, @@ -161,7 +161,7 @@ class Gc { if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { std::vector inputs = {from.data(), to->data()}; - mxnet::op::Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); + mxnet::op::Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); }, from.ctx(), {from.var()}, {to->var()}, mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { @@ -169,7 +169,7 @@ class 
Gc { if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { std::vector inputs = {from.data(), to->data()}; - mxnet::op::Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); + mxnet::op::Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); // Wait GPU kernel to complete ctx.get_stream()->Wait(); }, from.ctx(), {from.var()}, {to->var()}, diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index e6db0dd5bc2d..1891dd742fc8 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -138,19 +138,35 @@ struct quantize_2bit { } }; +//template +//void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, +// const float threshold) { +// mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array size +// inputs[0].Size(), // original size +// inputs[2].dptr(), // compressed array +// inputs[0].dptr(), // original array +// inputs[1].dptr(), // residual array +// -1 * threshold, // negative threshold +// threshold); // positive threshold +//} + +#ifndef __CUDACC__ + void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float threshold); +#endif template void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float threshold) { - using namespace mshadow; - mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array size - inputs[0].Size(), // original size - inputs[2].dptr(), // compressed array - inputs[0].dptr(), // original array - inputs[1].dptr(), // residual array - -1 * threshold, // negative threshold - threshold); // positive threshold + const float threshold) { + mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array size + inputs[0].Size(), // original size + inputs[2].dptr(), // compressed array + inputs[0].dptr(), // original array + inputs[1].dptr(), // residual array + -1 * threshold, // 
negative threshold + threshold); // positive threshold } + // this function has been defined as quantize_2bit operator template void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, @@ -160,7 +176,7 @@ void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); const TwoBitParam& param = nnvm::get(attrs.parsed); - Quantize2BitImpl(s, inputs, param.threshold); + Quantize2BitImpl(s, inputs, param.threshold); } inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, @@ -230,6 +246,11 @@ struct dequantize_2bit { } }; +#ifndef __CUDACC__ + void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float threshold); +#endif + template void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { From e6e41e4071dcfa91786153fee16f321fa0093ad1 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 7 Nov 2017 17:32:49 -0800 Subject: [PATCH 187/237] remove multiple delete of gc --- src/kvstore/kvstore_dist.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 49ffcd9a0ce0..7b6fc04a8b1a 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -63,7 +63,6 @@ class KVStoreDist : public KVStoreLocal { } virtual ~KVStoreDist() { - delete gc_; Engine::Get()->WaitForAll(); if (IsWorkerNode()) { if (barrier_before_exit_) { From a7d6c68118124a7f1915be002719a8354e845023 Mon Sep 17 00:00:00 2001 From: Rahul Date: Tue, 7 Nov 2017 17:52:27 -0800 Subject: [PATCH 188/237] add expected to local kvstore test --- tests/nightly/test_kvstore.py | 96 ++++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 25 deletions(-) diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index d9415865d06a..8facf9baea76 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -29,7 +29,7 @@ def check_diff_to_scalar(A, x, rank=None): keys = [3, 5, 7] # 
let the last shape exceed MXNET_KVSTORE_BIGARRAY_BOUND -shapes = [(4, 4), (100, 100), (2000, 2000)]; +shapes = [(4, 4), (100, 100), (2000, 2000)] lr = .1 nworker = 4 @@ -69,6 +69,51 @@ def test_compress_kvstore(kv_type, compression='2bit', threshold=0.5): for k, s in zip(keys, shapes): kv.init(k, mx.nd.zeros(s)) + def compute_expected(arr, curr_residual, threshold): + from struct import pack,unpack + def bits2int(bits): + bits = [int(x) for x in bits[::-1]] + x = 0 + for i in range(len(bits)): + x += bits[i]*2**i + return x + + def as_float32(s): + return unpack("f",pack("I", bits2int(s)))[0] + + # str_quant stores the quantized representation as a sequence of bits + str_quant = '' + new_residual = [] + decompr = [] + arr_npy = arr.asnumpy() + curr_res_npy = curr_residual.asnumpy() + for i, a in np.ndenumerate(arr_npy): + a += curr_res_npy[i] + if a >= threshold: + str_quant += '11' + new_residual.append(a - threshold) + decompr.append(threshold) + elif a <= (-1*threshold): + str_quant += '10' + new_residual.append(a + threshold) + decompr.append(-1*threshold) + else: + str_quant += '00' + new_residual.append(a) + decompr.append(0) + # append extra bits when size of array not a factor of 16 + if len(str_quant)%16 != 0: + str_quant += '0'*(16 - len(str_quant)%16) + + compr = [] + # converts the string generated into integers 32chars at a time + i = 0 + while i Date: Tue, 7 Nov 2017 17:59:36 -0800 Subject: [PATCH 189/237] fix operator compile issues --- src/operator/contrib/two_bit_quantize-inl.h | 76 ++++++++++++--------- 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 1891dd742fc8..bc57a1a7cf7b 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -138,33 +138,31 @@ struct quantize_2bit { } }; -//template -//void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, -// const float 
threshold) { -// mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array size -// inputs[0].Size(), // original size -// inputs[2].dptr(), // compressed array -// inputs[0].dptr(), // original array -// inputs[1].dptr(), // residual array -// -1 * threshold, // negative threshold -// threshold); // positive threshold -//} +template +void Quantize2BitKernelLaunch(mshadow::Stream *s, const std::vector& inputs, const float threshold) { + mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array size + inputs[0].Size(), // original size + inputs[2].dptr(), // compressed array + inputs[0].dptr(), // original array + inputs[1].dptr(), // residual array + -1 * threshold, // negative threshold + threshold); // positive threshold +} + +inline void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { + Quantize2BitKernelLaunch(s, inputs, threshold); +} #ifndef __CUDACC__ - void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float threshold); -#endif +void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float threshold); +#else template -void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float threshold) { - mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array size - inputs[0].Size(), // original size - inputs[2].dptr(), // compressed array - inputs[0].dptr(), // original array - inputs[1].dptr(), // residual array - -1 * threshold, // negative threshold - threshold); // positive threshold +inline void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float threshold) { + Quantize2BitKernelLaunch(s, inputs, threshold); } +#endif // this function has been defined as quantize_2bit operator @@ -246,20 +244,30 @@ struct dequantize_2bit { } }; -#ifndef __CUDACC__ - void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float threshold); -#endif template -void Dequantize2BitImpl(mshadow::Stream* 
s, const std::vector& inputs, - const float threshold) { +void Dequantize2BitKernelLaunch(mshadow::Stream *s, const std::vector& inputs, const float threshold) { mxnet::op::mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size - inputs[1].dptr(), // out array - inputs[0].dptr(), // compressed array - -1*threshold, // negative threshold - threshold); // positive threshold + inputs[1].dptr(), // out array + inputs[0].dptr(), // compressed array + -1*threshold, // negative threshold + threshold); // positive threshold +} + +inline void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { + Dequantize2BitKernelLaunch(s, inputs, threshold); +} + +#ifndef __CUDACC__ +void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float threshold); +#else +template +inline void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float threshold) { + Dequantize2BitKernelLaunch(s, inputs, threshold); } +#endif template void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, @@ -269,7 +277,7 @@ void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { mshadow::Stream *s = ctx.get_stream(); const TwoBitParam& param = nnvm::get(attrs.parsed); - Dequantize2BitImpl(s, inputs, param.threshold); + Dequantize2BitImpl(s, inputs, param.threshold); } inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, From 7ec0655193175a8e3000d709c84750152328e666 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 7 Nov 2017 18:02:00 -0800 Subject: [PATCH 190/237] fix operator compile issues --- src/operator/contrib/two_bit_quantize-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index bc57a1a7cf7b..6da496e4f595 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -158,7 +158,7 @@ void 
Quantize2BitImpl(mshadow::Stream* s, const std::vector const float threshold); #else template -inline void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, +inline void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { Quantize2BitKernelLaunch(s, inputs, threshold); } @@ -263,7 +263,7 @@ void Dequantize2BitImpl(mshadow::Stream* s, const std::vector -inline void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, +inline void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { Dequantize2BitKernelLaunch(s, inputs, threshold); } From b72df8ee353238507a8d38e1818c8923838f1e8b Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 8 Nov 2017 10:27:54 -0800 Subject: [PATCH 191/237] fix operator compile and link issues --- include/mxnet/kvstore.h | 2 +- src/kvstore/gc.h | 200 -------------------- src/operator/contrib/two_bit_quantize-inl.h | 175 +---------------- src/operator/contrib/two_bit_quantize.cc | 86 --------- src/operator/contrib/two_bit_quantize.cu | 15 +- 5 files changed, 10 insertions(+), 468 deletions(-) delete mode 100644 src/kvstore/gc.h diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index 637324786562..b47ca04e8e2a 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -30,7 +30,7 @@ #include #include #include -#include "../../src/kvstore/gc.h" +#include "./gc.h" #include "./ndarray.h" #if MXNET_USE_DIST_KVSTORE #include "ps/ps.h" diff --git a/src/kvstore/gc.h b/src/kvstore/gc.h deleted file mode 100644 index 02a6c56ffa0a..000000000000 --- a/src/kvstore/gc.h +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file gc.h - * \brief Gradient compression for kvstore - * \author Rahul Huilgol - */ - -#ifndef MXNET_KVSTORE_GC_H -#define MXNET_KVSTORE_GC_H -#include -#include -#include - -#include"../../include/mxnet/ndarray.h" -#include "../../include/mxnet/base.h" -#include -#include "../operator/contrib/two_bit_quantize-inl.h" - -// TODO check if it returns empty between two delims -template -void split(const std::string &s, const char delim, Out result) { - std::stringstream ss; - ss.str(s); - std::string item; - while (std::getline(ss, item, delim)) { - *(result++) = item; - } -} - -enum CompressionType { - GC_NONE, GC_TWO_BIT -}; - -class Gc { - public: - Gc() { - type_ = GC_NONE; - active_ = false; - } - - virtual ~Gc() { } - - void SetParams(const std::string& compression_type, const float threshold) { - if (compression_type == "2bit") { - SetTwoBitCompression(threshold); - } - } - - void set_active() { - active_ = true; - } - - bool get_active_type() { - if (active_) return type_; - else return GC_NONE; - } - - void SetTwoBitCompression(const float threshold) { - type_ = GC_TWO_BIT; - threshold_ = threshold; - } - - std::string EncodeParams() { - std::string rval = std::to_string(type_); - if (type_ == GC_TWO_BIT) { - rval += "," + std::to_string(threshold_); - } - return rval; - } - - void DecodeParams(const std::string& s) { - std::vector elems; - split(s, ',', 
std::back_inserter(elems)); - type_ = static_cast(stoi(elems[0])); - if (elems.size()>1) { - if (!elems[1].empty()) { - threshold_ = stof(elems[1]); - } - } - } - - int GetCompressionFactor() { - if (type_ == GC_TWO_BIT) { - return 16; - } else { - LOG(FATAL) << "Unsupported compression type"; - return 0; - } - } - - int64_t GetCompressedSize(const int64_t original_size){ - const int bits = GetCompressionFactor(); - return ((original_size % bits == 0) ? - original_size / bits : - original_size / bits + 1); - } - - void Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, - mxnet::NDArray *residual, const int priority) { - CHECK(from.shape().ndim() != 0) - << "source operands have zero dimension shape"; - // important: callback must always capture by value - int a = from.ctx().dev_mask(); - int b = to->ctx().dev_mask(); - const float threshold = threshold_; - if (type_ == GC_TWO_BIT) { - if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), residual->data(), to->data()}; - mxnet::op::Quantize2BitImpl(ctx.get_stream(), inputs, threshold); - }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); - } else { - #if MXNET_USE_CUDA - if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), residual->data(), to->data()}; - mxnet::op::Quantize2BitImpl(ctx.get_stream(), inputs, threshold); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); - } else { - LOG(FATAL) << "unknown device mask"; - } - #else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - #endif - } - } else { - LOG(FATAL) << 
"Unsupported quantization of type " << type_; - } - } - - void Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority) { - CHECK(from.shape().ndim() != 0) - << "source operands have zero dimension shape"; - // important: callback must always capture by value - const int a = from.ctx().dev_mask(); - const int b = to->ctx().dev_mask(); - const float threshold = threshold_; - if (type_ == GC_TWO_BIT) { - if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), to->data()}; - mxnet::op::Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); - }, from.ctx(), {from.var()}, {to->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); - } else { - #if MXNET_USE_CUDA - if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), to->data()}; - mxnet::op::Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, from.ctx(), {from.var()}, {to->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); - } else { - LOG(FATAL) << "unknown device mask"; - } - #else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - #endif - } - } else { - LOG(FATAL) << "Unsupported dequantization of type " << type_; - } - } - - CompressionType type_; - - bool active_; - - float threshold_ = 0; - -private: - -}; - - -#endif //MXNET_KVSTORE_GC_H diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h index 6da496e4f595..5e86a67a52e2 100644 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ b/src/operator/contrib/two_bit_quantize-inl.h @@ -37,68 +37,6 @@ namespace mxnet { namespace op { -struct init_mem_2bit { - // Initialize output array - MSHADOW_XINLINE static void 
Map(int i, float* out) { - *(out+i) = 0; - } -}; - -struct TwoBitParam : public dmlc::Parameter { - float threshold; - DMLC_DECLARE_PARAMETER(TwoBitParam) { - DMLC_DECLARE_FIELD(threshold) - .set_default(0.5) - .describe("Threshold to quantize values. " - "Must be greater than 0"); - } -}; - -template -void Create2BitArrayCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - // For now, this method can only compress the float data - mshadow::Stream *s = ctx.get_stream(); - // Init the memory of output to 0x00000000 - mxnet_op::Kernel::Launch(s, outputs[0].Size(), - outputs[0].dptr()); // compressed array - -} - -inline bool Create2BitArrayShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - // 0. input array - CHECK_EQ(in_attrs->size(), 1U); - // 0. output array - CHECK_EQ(out_attrs->size(), 1U); - // check input - CHECK(!shape_is_none(in_attrs->at(0))); - // output - int shape = in_attrs->at(0).Size() % 16 == 0 ? - in_attrs->at(0).Size() / 16 : - in_attrs->at(0).Size() / 16 + 1; - SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape{shape}); - return true; -} - -inline bool Create2BitArrayType(const nnvm::NodeAttrs &attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - // 0. input array - CHECK_EQ(in_attrs->size(), 1U); - // 0. 
output array - CHECK_EQ(out_attrs->size(), 1U); - // check input - CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) - << "`create_2bit_` only supports float32 input for now"; - TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32); - return true; -} - struct quantize_2bit { MSHADOW_XINLINE static void Map(int out_block_id, int original_size, @@ -153,66 +91,7 @@ inline void Quantize2BitImpl(mshadow::Stream* s, const std::vector& Quantize2BitKernelLaunch(s, inputs, threshold); } -#ifndef __CUDACC__ -void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float threshold); -#else -template -inline void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float threshold) { - Quantize2BitKernelLaunch(s, inputs, threshold); -} -#endif - - -// this function has been defined as quantize_2bit operator -template -void Quantize2BitCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - mshadow::Stream *s = ctx.get_stream(); - const TwoBitParam& param = nnvm::get(attrs.parsed); - Quantize2BitImpl(s, inputs, param.threshold); -} - -inline bool Quantize2BitShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - // 0. input array - // 1. residual array - // 2. compressed array - CHECK_EQ(in_attrs->size(), 3U); - CHECK(!shape_is_none(in_attrs->at(0))); - CHECK(!shape_is_none(in_attrs->at(1))); - CHECK_EQ(in_attrs->at(0).Size(), - in_attrs->at(1).Size()); - int shape = in_attrs->at(0).Size() % 16 == 0 ? - in_attrs->at(0).Size() / 16 : - in_attrs->at(0).Size() / 16 + 1; - CHECK_EQ(in_attrs->at(2).Size(), shape) - << "The size of output array is not equal to " - << "the size of compressed array"; - return true; -} - -inline bool Quantize2BitType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - // 0. input array - // 1. residual array - // 2. 
compressed array - CHECK_EQ(in_attrs->size(), 3U); - // check input - CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) - << "`quantize_2bit_` only supports float32 input for now"; - CHECK_EQ((*in_attrs)[1], mshadow::kFloat32) - << "`quantize_2bit_` only supports float32 input for now"; - CHECK_EQ((*in_attrs)[2], mshadow::kFloat32) - << "`quantize_2bit_` only supports float32 input for now"; - return true; -} +void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold); struct dequantize_2bit { // Decompress @@ -258,60 +137,8 @@ inline void Dequantize2BitImpl(mshadow::Stream* s, const std::vector Dequantize2BitKernelLaunch(s, inputs, threshold); } -#ifndef __CUDACC__ void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold); -#else -template -inline void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float threshold) { - Dequantize2BitKernelLaunch(s, inputs, threshold); -} -#endif - -template -void Dequantize2BitCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - mshadow::Stream *s = ctx.get_stream(); - const TwoBitParam& param = nnvm::get(attrs.parsed); - Dequantize2BitImpl(s, inputs, param.threshold); -} - -inline bool Dequantize2BitShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - // 0. compressed array - // 1. original array - CHECK_EQ(in_attrs->size(), 2U); - // No output - CHECK_EQ(out_attrs->size(), 0U); - // check input - CHECK(!shape_is_none(in_attrs->at(0))); - CHECK(!shape_is_none(in_attrs->at(1))); - // TODO(huilgolr) check - CHECK_LE(in_attrs->at(1).Size(), - in_attrs->at(0).Size()*16) - << "The shape of the second input array are " - << "not equal to the original array."; - return true; -} - -inline bool Dequantize2BitType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - // 0. 
compressed array - // 1. original array - CHECK_EQ(in_attrs->size(), 2U); - // check input - CHECK_EQ((*in_attrs)[0], mshadow::kFloat32) - << "`dequantize_2bit_` only supports float32 input for now"; - CHECK_EQ((*in_attrs)[1], mshadow::kFloat32) - << "`dequantize_2bit_` only supports float32 input for now"; - return true; -} } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc index deedab434a51..0e9e060bbbf1 100644 --- a/src/operator/contrib/two_bit_quantize.cc +++ b/src/operator/contrib/two_bit_quantize.cc @@ -22,95 +22,9 @@ * \brief registers quantize_2bit, dequantize_2bit * and create_2bit operators with nnvm */ -#include "../elemwise_op_common.h" -#include "./two_bit_quantize-inl.h" namespace mxnet { namespace op { -DMLC_REGISTER_PARAMETER(TwoBitParam); - -NNVM_REGISTER_OP(_contrib_quantize_2bit) -.describe(R"code(Quantize an input tensor into using 2bits for each value using -user-specified thresholds, while storing quantization error in residual array. - -The quantize_2bit operator takes 4 arguments and is called as follows: -`quantize_2bit(array, residual, out, threshold)`. -The operator modifies `residual` and `out` arrays. -The `out`variable will be the quantized array. Note that, `out` array can be generated by -invoking `create_2bit(array)`, avoiding calculation of size of quantized array. -This `out` array represents quantized data. -The operation sums up array and residual, and then -applies the thresholds to quantize the data into one of three states -represented by 2bits. 16 such quantized floats in the original array -are packed together into one float in the `out` array. -The quantization error is stored in residual array. - -For example, assume the input array (gradient) is [5.0, -1.0, -5.0, -4.0], and the -residual is [0.0, -2.0, 0, 1.0]. Let the threshold be +4.0. 
-In this method, the elements whose (gradient + residual) >= threshold -will be quantized into 2-bits '01', and the elements -whose (gradient + residual) <= -1*threshold will be -quantized into 2-bits '10'. The other elements will be quantized -as '00'. Every 16 floats in the original array will be packed -into one float variable in the output array. - -In this example, 'out' has 1 element, which represents upto16 elements of -original array quantized into a single element. -The residual is also updated to [1.0, -3.0, -1.0, -3.0]. -)code" ADD_FILELINE) -.set_num_inputs(3) -.set_num_outputs(0) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", Quantize2BitShape) -.set_attr("FInferType", Quantize2BitType) -.set_attr("FCompute", Quantize2BitCompute) -.set_attr("FGradient", ElemwiseGradUseNone{"_quantize_2bit"}) -.set_attr("FMutateInputs", -[](const nnvm::NodeAttrs& attrs) { - return std::vector{1, 2}; -}) -.add_argument("gradient_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") -.add_argument("residual_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") -.add_argument("quantized_array", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") -.add_arguments(TwoBitParam::__FIELDS__()); - - -NNVM_REGISTER_OP(_contrib_create_2bit) - .describe(R"code(Generate an array with the right shape to store the input data after -two bit quantization. This array will be on the same context as input array. -)code" ADD_FILELINE) -.set_num_inputs(1) -.set_num_outputs(1) -.set_attr("FInferShape", Create2BitArrayShape) -.set_attr("FInferType", Create2BitArrayType) -.set_attr("FCompute", Create2BitArrayCompute) -.set_attr("FGradient", ElemwiseGradUseNone{"_create_2bit"}) -.add_argument("input", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`"); - -NNVM_REGISTER_OP(_contrib_dequantize_2bit) -.describe(R"code(Dequantize an input tensor quantized by quantize_2bit. - -The dequantize_2bit operator takes three input arguments. 
The first input is a NDArray, -which has been generated by quantize_2bit(). -The second input is a NDArray that has the same shape as the original -array before quantizing. The operator replaces the contents of this array -with dequantized data. Third argument is threshold used to quantize the array. -)code" ADD_FILELINE) -.set_num_inputs(2) -.set_num_outputs(0) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", Dequantize2BitShape) -.set_attr("FInferType", Dequantize2BitType) -.set_attr("FCompute", Dequantize2BitCompute) -.set_attr("FGradient", ElemwiseGradUseNone{"_dequantize_2bit"}) -.set_attr("FMutateInputs", -[](const nnvm::NodeAttrs& attrs) { - return std::vector{1}; -}) -.add_argument("quantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") -.add_argument("dequantized_data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") -.add_arguments(TwoBitParam::__FIELDS__()); - } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/two_bit_quantize.cu b/src/operator/contrib/two_bit_quantize.cu index b90ff1840771..087cc9102a7c 100644 --- a/src/operator/contrib/two_bit_quantize.cu +++ b/src/operator/contrib/two_bit_quantize.cu @@ -27,14 +27,15 @@ namespace mxnet { namespace op { -NNVM_REGISTER_OP(_contrib_quantize_2bit) -.set_attr("FCompute", Quantize2BitCompute); +void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float threshold) { + Quantize2BitKernelLaunch(s, inputs, threshold); +} -NNVM_REGISTER_OP(_contrib_dequantize_2bit) -.set_attr("FCompute", Dequantize2BitCompute); - -NNVM_REGISTER_OP(_contrib_create_2bit) -.set_attr("FCompute", Create2BitArrayCompute); +void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float threshold) { + Dequantize2BitKernelLaunch(s, inputs, threshold); +} } // namespace op } // namespace mxnet From 2913b5603fff5d0ec398f34eda1bf8885af2acc7 Mon Sep 17 00:00:00 2001 From: Rahul Date: Wed, 8 Nov 2017 10:36:10 -0800 Subject: [PATCH 
192/237] remove gc.cpp --- include/mxnet/gc.h | 188 +++++++++++++++++++++++++++++++++++++++++++++ src/kvstore/gc.cpp | 5 -- 2 files changed, 188 insertions(+), 5 deletions(-) create mode 100644 include/mxnet/gc.h delete mode 100644 src/kvstore/gc.cpp diff --git a/include/mxnet/gc.h b/include/mxnet/gc.h new file mode 100644 index 000000000000..0a96b430255b --- /dev/null +++ b/include/mxnet/gc.h @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file gc.h + * \brief Gradient compression for kvstore + * \author Rahul Huilgol + */ + +#ifndef MXNET_KVSTORE_GC_H +#define MXNET_KVSTORE_GC_H +#include +#include +#include +#include"./ndarray.h" +#include "./base.h" +#include +#include "../../src/operator/contrib/two_bit_quantize-inl.h" + +enum CompressionType { + GC_NONE, GC_TWO_BIT +}; + +class Gc { + public: + Gc() { + type_ = GC_NONE; + active_ = false; + } + + virtual ~Gc() { } + + void SetParams(const std::string& compression_type, const float threshold) { + if (compression_type == "2bit") { + SetTwoBitCompression(threshold); + } + } + + void set_active() { + active_ = true; + } + + bool get_active_type() { + if (active_) return type_; + else return GC_NONE; + } + + void SetTwoBitCompression(const float threshold) { + type_ = GC_TWO_BIT; + threshold_ = threshold; + } + + std::string EncodeParams() { + std::string rval = std::to_string(type_); + if (type_ == GC_TWO_BIT) { + rval += "," + std::to_string(threshold_); + } + return rval; + } + + void DecodeParams(const std::string& s) { + std::vector elems; + split(s, ',', std::back_inserter(elems)); + type_ = static_cast(stoi(elems[0])); + if (elems.size()>1) { + if (!elems[1].empty()) { + threshold_ = stof(elems[1]); + } + } + } + + int GetCompressionFactor() { + if (type_ == GC_TWO_BIT) { + return 16; + } else { + LOG(FATAL) << "Unsupported compression type"; + return 0; + } + } + + int64_t GetCompressedSize(const int64_t original_size){ + const int bits = GetCompressionFactor(); + return ((original_size % bits == 0) ? 
+ original_size / bits : + original_size / bits + 1); + } + + void Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, + mxnet::NDArray *residual, const int priority) { + CHECK(from.shape().ndim() != 0) + << "source operands have zero dimension shape"; + // important: callback must always capture by value + int a = from.ctx().dev_mask(); + int b = to->ctx().dev_mask(); + const float threshold = threshold_; + if (type_ == GC_TWO_BIT) { + if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), residual->data(), to->data()}; + mxnet::op::Quantize2BitImpl(ctx.get_stream(), inputs, threshold); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); + } else { + #if MXNET_USE_CUDA + if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), residual->data(), to->data()}; + mxnet::op::Quantize2BitImpl(ctx.get_stream(), inputs, threshold); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); + } else { + LOG(FATAL) << "unknown device mask"; + } + #else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + #endif + } + } else { + LOG(FATAL) << "Unsupported quantization of type " << type_; + } + } + + void Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority) { + CHECK(from.shape().ndim() != 0) + << "source operands have zero dimension shape"; + // important: callback must always capture by value + const int a = from.ctx().dev_mask(); + const int b = to->ctx().dev_mask(); + const float threshold = threshold_; + if (type_ == GC_TWO_BIT) { + if (a == 
mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), to->data()}; + mxnet::op::Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); + }, from.ctx(), {from.var()}, {to->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); + } else { + #if MXNET_USE_CUDA + if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), to->data()}; + mxnet::op::Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {to->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); + } else { + LOG(FATAL) << "unknown device mask"; + } + #else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + #endif + } + } else { + LOG(FATAL) << "Unsupported dequantization of type " << type_; + } + } + + CompressionType type_; + + bool active_; + + float threshold_ = 0; + +private: + +}; + + +#endif //MXNET_KVSTORE_GC_H diff --git a/src/kvstore/gc.cpp b/src/kvstore/gc.cpp deleted file mode 100644 index 68ea2ad0214f..000000000000 --- a/src/kvstore/gc.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// -// Created by Huilgol, Rahul on 11/1/17. 
-// - -#include "gc.h" From f2e2469bd602d69e85feb482efaba95f01693c9f Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 8 Nov 2017 10:48:07 -0800 Subject: [PATCH 193/237] add split function --- include/mxnet/gc.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/mxnet/gc.h b/include/mxnet/gc.h index 0a96b430255b..7ae93cb7f7ee 100644 --- a/include/mxnet/gc.h +++ b/include/mxnet/gc.h @@ -33,6 +33,17 @@ #include #include "../../src/operator/contrib/two_bit_quantize-inl.h" +// TODO check if it returns empty between two delims +template +void split(const std::string &s, const char delim, Out result) { + std::stringstream ss; + ss.str(s); + std::string item; + while (std::getline(ss, item, delim)) { + *(result++) = item; + } +} + enum CompressionType { GC_NONE, GC_TWO_BIT }; From 30eae11f2c7483c397bcd6e7ecc634f3acf7f66e Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 8 Nov 2017 13:17:19 -0800 Subject: [PATCH 194/237] move setting of active gc --- example/image-classification/common/fit.py | 2 +- include/mxnet/gc.h | 7 +++++-- src/kvstore/kvstore_dist.h | 5 ++--- src/operator/contrib/two_bit_quantize-inl.h | 1 - 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 2d02fece1ca9..fd5174afcaad 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -119,7 +119,7 @@ def fit(args, network, data_loader, **kwargs): """ # kvstore kv = mx.kvstore.create(args.kv_store) - kv.set_gradient_compression({'compress':args.gc_type, 'threshold':args.gc_threshold}) + kv.set_gradient_compression({'compression':args.gc_type, 'threshold':args.gc_threshold}) # logging head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) diff --git a/include/mxnet/gc.h b/include/mxnet/gc.h index 7ae93cb7f7ee..59af5fef1a3b 100644 --- a/include/mxnet/gc.h +++ 
b/include/mxnet/gc.h @@ -67,6 +67,10 @@ class Gc { active_ = true; } + bool get_active() { + return active_; + } + bool get_active_type() { if (active_) return type_; else return GC_NONE; @@ -185,14 +189,13 @@ class Gc { } } +private: CompressionType type_; bool active_; float threshold_ = 0; -private: - }; diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 7b6fc04a8b1a..757a5756f701 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -183,8 +183,6 @@ class KVStoreDist : public KVStoreLocal { comm_buf_[key].WaitToWrite(); compr_buf_[key].WaitToWrite(); } - gc_->set_active(); - } else { // do nothing } @@ -303,7 +301,7 @@ class KVStoreDist : public KVStoreLocal { std::vector uniq_keys; std::vector > grouped_vals; GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals); - + if (do_merge && !gc_->get_active()) gc_->set_active(); for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devices int key = uniq_keys[i]; @@ -332,6 +330,7 @@ class KVStoreDist : public KVStoreLocal { // push to servers if (storage_type == kDefaultStorage) { if (gc_->get_active_type() == GC_NONE) { + std::cout<<"gc is none for push of key"< #include #include #include From f5ddf7f73539383758ac4d043cc44a0734fe0756 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 8 Nov 2017 14:21:54 -0800 Subject: [PATCH 195/237] move all to gc.cpp, compile works for cpu --- include/mxnet/gc.h | 194 ++++++--------------------- include/mxnet/kvstore.h | 2 +- python/mxnet/kvstore.py | 4 +- src/kvstore/comm.h | 1 + src/kvstore/gc.cc | 283 ++++++++++++++++++++++++++++++++++++++++ src/kvstore/gc.cu | 35 +++++ 6 files changed, 364 insertions(+), 155 deletions(-) create mode 100644 src/kvstore/gc.cc create mode 100644 src/kvstore/gc.cu diff --git a/include/mxnet/gc.h b/include/mxnet/gc.h index 59af5fef1a3b..49ef153c7e1c 100644 --- a/include/mxnet/gc.h +++ b/include/mxnet/gc.h @@ -31,18 +31,8 @@ #include"./ndarray.h" #include "./base.h" #include -#include 
"../../src/operator/contrib/two_bit_quantize-inl.h" - -// TODO check if it returns empty between two delims -template -void split(const std::string &s, const char delim, Out result) { - std::stringstream ss; - ss.str(s); - std::string item; - while (std::getline(ss, item, delim)) { - *(result++) = item; - } -} +namespace mxnet { + namespace kvstore { enum CompressionType { GC_NONE, GC_TWO_BIT @@ -50,153 +40,53 @@ enum CompressionType { class Gc { public: - Gc() { - type_ = GC_NONE; - active_ = false; - } - - virtual ~Gc() { } - - void SetParams(const std::string& compression_type, const float threshold) { - if (compression_type == "2bit") { - SetTwoBitCompression(threshold); - } - } - - void set_active() { - active_ = true; - } - - bool get_active() { - return active_; - } - - bool get_active_type() { - if (active_) return type_; - else return GC_NONE; - } - - void SetTwoBitCompression(const float threshold) { - type_ = GC_TWO_BIT; - threshold_ = threshold; - } - - std::string EncodeParams() { - std::string rval = std::to_string(type_); - if (type_ == GC_TWO_BIT) { - rval += "," + std::to_string(threshold_); - } - return rval; - } - - void DecodeParams(const std::string& s) { - std::vector elems; - split(s, ',', std::back_inserter(elems)); - type_ = static_cast(stoi(elems[0])); - if (elems.size()>1) { - if (!elems[1].empty()) { - threshold_ = stof(elems[1]); - } - } - } - - int GetCompressionFactor() { - if (type_ == GC_TWO_BIT) { - return 16; - } else { - LOG(FATAL) << "Unsupported compression type"; - return 0; - } - } - - int64_t GetCompressedSize(const int64_t original_size){ - const int bits = GetCompressionFactor(); - return ((original_size % bits == 0) ? 
- original_size / bits : - original_size / bits + 1); - } + Gc(); + + virtual ~Gc() {} + + void SetParams(const std::string& compression_type, const float threshold); + + void set_active(); + + bool get_active(); + + bool get_active_type(); + + void SetTwoBitCompression(const float threshold); + + std::string EncodeParams(); + + void DecodeParams(const std::string& s); + + int GetCompressionFactor(); + + int64_t GetCompressedSize(const int64_t original_size); void Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, - mxnet::NDArray *residual, const int priority) { - CHECK(from.shape().ndim() != 0) - << "source operands have zero dimension shape"; - // important: callback must always capture by value - int a = from.ctx().dev_mask(); - int b = to->ctx().dev_mask(); - const float threshold = threshold_; - if (type_ == GC_TWO_BIT) { - if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), residual->data(), to->data()}; - mxnet::op::Quantize2BitImpl(ctx.get_stream(), inputs, threshold); - }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); - } else { - #if MXNET_USE_CUDA - if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), residual->data(), to->data()}; - mxnet::op::Quantize2BitImpl(ctx.get_stream(), inputs, threshold); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); - } else { - LOG(FATAL) << "unknown device mask"; - } - #else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - #endif - } - } else { - LOG(FATAL) << "Unsupported quantization of type " << type_; - } - } - 
- void Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority) { - CHECK(from.shape().ndim() != 0) - << "source operands have zero dimension shape"; - // important: callback must always capture by value - const int a = from.ctx().dev_mask(); - const int b = to->ctx().dev_mask(); - const float threshold = threshold_; - if (type_ == GC_TWO_BIT) { - if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), to->data()}; - mxnet::op::Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); - }, from.ctx(), {from.var()}, {to->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); - } else { - #if MXNET_USE_CUDA - if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), to->data()}; - mxnet::op::Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, from.ctx(), {from.var()}, {to->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); - } else { - LOG(FATAL) << "unknown device mask"; - } - #else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - #endif - } - } else { - LOG(FATAL) << "Unsupported dequantization of type " << type_; - } - } + mxnet::NDArray *residual, const int priority); + + void Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority); private: - CompressionType type_; +// void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold); +// void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold); +// +// void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold); +// void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float 
threshold); +// +// template +// void Quantize2BitKernelLaunch(mshadow::Stream *s, const std::vector& inputs, const float threshold); +// template +// void Dequantize2BitKernelLaunch(mshadow::Stream *s, const std::vector& inputs, const float threshold); - bool active_; + CompressionType type_; - float threshold_ = 0; + bool active_; -}; + float threshold_ = 0; +}; + } +} #endif //MXNET_KVSTORE_GC_H diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index b47ca04e8e2a..5389434fcecc 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -400,7 +400,7 @@ class KVStore { * starts with none, used after SetGradientCompression sets the type * currently there is no support for unsetting gradient compression */ - Gc* gc_; + kvstore::Gc* gc_; /** diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 119e08ff7fb9..80cf6b93981d 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -398,7 +398,7 @@ def set_gradient_compression(self, compression_params=None): compression: str type of low-bit quantization to be used for gradient compression - Can only be '2bit' for now. + Can only be '2bit' or `none` for now. 2bit gradient compression uses 2bit quantization with residual to compress gradients. 
It works by converts each value in the original gradient to use 2 bits, causing size of gradient to be 1/16th of the original gradient @@ -417,7 +417,7 @@ def set_gradient_compression(self, compression_params=None): raise ValueError('compression_params requires `compression` to be set') elif not isinstance(compression_params['compression'], string_types): raise TypeError('compression must be a string') - elif compression_params['compression'] not in ['2bit']: + elif compression_params['compression'] not in ['none','2bit']: raise ValueError('Unsupported type of compression') if compression_params['compression'] == '2bit': diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index c927962e5f4d..dbce922baa1d 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -30,6 +30,7 @@ #include #include #include "mxnet/ndarray.h" +#include #include "../ndarray/ndarray_function.h" #include "../operator/tensor/sparse_retain-inl.h" namespace mxnet { diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc new file mode 100644 index 000000000000..84338eda09b2 --- /dev/null +++ b/src/kvstore/gc.cc @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file gc.cc + * \brief Gradient compression for kvstore + * \author Rahul Huilgol + */ +#include +#include + +namespace mxnet { + namespace kvstore { + +// TODO check if it returns empty between two delims +template +void split(const std::string &s, const char delim, Out result) { + std::stringstream ss; + ss.str(s); + std::string item; + while (std::getline(ss, item, delim)) { + *(result++) = item; + } +} + +Gc::Gc() { + type_ = GC_NONE; + active_ = false; +} + +void Gc::SetParams(const std::string& compression_type, const float threshold) { + if (compression_type == "2bit") { + SetTwoBitCompression(threshold); + } +} + +void Gc::set_active() { + active_ = true; +} + +bool Gc::get_active() { + return active_; +} + +bool Gc::get_active_type() { + if (active_) return type_; + else return GC_NONE; +} + +void Gc::SetTwoBitCompression(const float threshold) { + type_ = GC_TWO_BIT; + threshold_ = threshold; +} + +std::string Gc::EncodeParams() { + std::string rval = std::to_string(type_); + if (type_ == GC_TWO_BIT) { + rval += "," + std::to_string(threshold_); + } + return rval; +} + +void Gc::DecodeParams(const std::string& s) { + std::vector elems; + split(s, ',', std::back_inserter(elems)); + type_ = static_cast(stoi(elems[0])); + if (elems.size()>1) { + if (!elems[1].empty()) { + threshold_ = stof(elems[1]); + } + } +} + +int Gc::GetCompressionFactor() { + if (type_ == GC_TWO_BIT) { + return 16; + } else { + LOG(FATAL) << "Unsupported compression type"; + return 0; + } +} + +int64_t Gc::GetCompressedSize(const int64_t original_size){ + const int bits = GetCompressionFactor(); + return ((original_size % bits == 0) ? 
+ original_size / bits : + original_size / bits + 1); +} + +struct quantize_2bit { + MSHADOW_XINLINE static void Map(int out_block_id, + int original_size, + float *out, + float *grad, + float *residual, + const float neg_threshold, + const float pos_threshold) { + float* compr_block = out + out_block_id; + // init to 0 + *compr_block = 0; + // start and end are indices in original grad array + int start = out_block_id << 4; + int end = start + 16; // <= original_size) ? start + 16 : original_size; + char* block_ptr = reinterpret_cast < char* > (compr_block); + const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; + + for (int i = start; i < end && i < original_size; i++) { + // // adds 1 when i-start divisible by 4 + char* curr_byte = block_ptr + ((i-start)>>2); + residual[i] += grad[i]; + if (residual[i] >= pos_threshold) { + residual[i] -= pos_threshold; + // set data to 11 + *curr_byte |= posbits[(i & 3)]; +// std::cout<<"pos "<< std::to_string(i&3) << " " << std::bitset<8>(*curr_byte)<(*curr_byte)<(*curr_byte)< +void Quantize2BitKernelLaunch(mshadow::Stream *s, const std::vector& inputs, const float threshold) { + mxnet::op::mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array size + inputs[0].Size(), // original size + inputs[2].dptr(), // compressed array + inputs[0].dptr(), // original array + inputs[1].dptr(), // residual array + -1 * threshold, // negative threshold + threshold); // positive threshold +} + +void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { + Quantize2BitKernelLaunch(s, inputs, threshold); +} + +struct dequantize_2bit { + // Decompress + MSHADOW_XINLINE static void Map(int i, + float *out, + float *in, + const float neg_threshold, + const float pos_threshold) { + + float* outval = out + i; + char* ch_ptr = reinterpret_cast(in + (i>>4)); + + ch_ptr += ((i & 15) >> 2 ); + const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const 
uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; + int col = i & 3; + uint8_t mask = posbits[col]; + uint8_t negmask = negbits[col]; + uint8_t masked = *ch_ptr & mask; + if ( masked == mask ) { + *outval = pos_threshold; + } // use posbits for mask as posbits are 11 + // compare with negbits + else if ( masked == negmask ) { + *outval = neg_threshold; + } else { + *outval = 0; + } + } +}; + +template +void Dequantize2BitKernelLaunch(mshadow::Stream *s, const std::vector& inputs, const float threshold) { + + mxnet::op::mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size + inputs[1].dptr(), // out array + inputs[0].dptr(), // compressed array + -1*threshold, // negative threshold + threshold); // positive threshold +} + +void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { + Dequantize2BitKernelLaunch(s, inputs, threshold); +} + +void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold); +void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold); + + + void Gc::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, + mxnet::NDArray *residual, const int priority) { + CHECK(from.shape().ndim() != 0) + << "source operands have zero dimension shape"; + // important: callback must always capture by value + int a = from.ctx().dev_mask(); + int b = to->ctx().dev_mask(); + const float threshold = threshold_; + if (type_ == GC_TWO_BIT) { + if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), residual->data(), to->data()}; + Quantize2BitImpl(ctx.get_stream(), inputs, threshold); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); + } else { + #if MXNET_USE_CUDA + if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { + 
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), residual->data(), to->data()}; + Quantize2BitImpl(ctx.get_stream(), inputs, threshold); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); + } else { + LOG(FATAL) << "unknown device mask"; + } + #else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + #endif + } + } else { + LOG(FATAL) << "Unsupported quantization of type " << type_; + } + } + + void Gc::Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority) { + CHECK(from.shape().ndim() != 0) + << "source operands have zero dimension shape"; + // important: callback must always capture by value + const int a = from.ctx().dev_mask(); + const int b = to->ctx().dev_mask(); + const float threshold = threshold_; + if (type_ == GC_TWO_BIT) { + if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), to->data()}; + Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); + }, from.ctx(), {from.var()}, {to->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); + } else { + #if MXNET_USE_CUDA + if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), to->data()}; + Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {to->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); + } else { + LOG(FATAL) << "unknown device mask"; + } + #else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + #endif + } + } else { + LOG(FATAL) << "Unsupported 
dequantization of type " << type_; + } + } + + } +} + diff --git a/src/kvstore/gc.cu b/src/kvstore/gc.cu new file mode 100644 index 000000000000..3b8748d6257e --- /dev/null +++ b/src/kvstore/gc.cu @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file gc.cu + * \brief for gpu + */ + +#include + +void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float threshold) { + Quantize2BitKernelLaunch(s, inputs, threshold); +} + +void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float threshold) { + Dequantize2BitKernelLaunch(s, inputs, threshold); +} \ No newline at end of file From d3b668d687617cd1170178d896cae52c3d3278ba Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 8 Nov 2017 14:51:39 -0800 Subject: [PATCH 196/237] WIP gpu compile --- src/kvstore/gc.cc | 313 ++++++++++++++++++++++++---------------------- src/kvstore/gc.cu | 20 +-- 2 files changed, 173 insertions(+), 160 deletions(-) diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc index 84338eda09b2..787789e56204 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gc.cc @@ -23,187 +23,196 @@ * \author Rahul Huilgol */ #include -#include +#include "../operator/mxnet_op.h" namespace mxnet { namespace kvstore { // TODO check if it returns empty between two delims -template -void split(const std::string &s, const char delim, Out result) { - std::stringstream ss; - ss.str(s); - std::string item; - while (std::getline(ss, item, delim)) { - *(result++) = item; - } -} + template + void split(const std::string &s, const char delim, Out result) { + std::stringstream ss; + ss.str(s); + std::string item; + while (std::getline(ss, item, delim)) { + *(result++) = item; + } + } -Gc::Gc() { - type_ = GC_NONE; - active_ = false; -} + Gc::Gc() { + type_ = GC_NONE; + active_ = false; + } -void Gc::SetParams(const std::string& compression_type, const float threshold) { - if (compression_type == "2bit") { - SetTwoBitCompression(threshold); - } -} + void Gc::SetParams(const std::string &compression_type, const float threshold) { + if (compression_type == "2bit") { + SetTwoBitCompression(threshold); + } + } -void Gc::set_active() { - active_ = true; -} + void Gc::set_active() { + active_ = true; + } -bool 
Gc::get_active() { - return active_; -} + bool Gc::get_active() { + return active_; + } -bool Gc::get_active_type() { - if (active_) return type_; - else return GC_NONE; -} + bool Gc::get_active_type() { + if (active_) return type_; + else return GC_NONE; + } -void Gc::SetTwoBitCompression(const float threshold) { - type_ = GC_TWO_BIT; - threshold_ = threshold; -} + void Gc::SetTwoBitCompression(const float threshold) { + type_ = GC_TWO_BIT; + threshold_ = threshold; + } -std::string Gc::EncodeParams() { - std::string rval = std::to_string(type_); - if (type_ == GC_TWO_BIT) { - rval += "," + std::to_string(threshold_); - } - return rval; -} + std::string Gc::EncodeParams() { + std::string rval = std::to_string(type_); + if (type_ == GC_TWO_BIT) { + rval += "," + std::to_string(threshold_); + } + return rval; + } -void Gc::DecodeParams(const std::string& s) { - std::vector elems; - split(s, ',', std::back_inserter(elems)); - type_ = static_cast(stoi(elems[0])); - if (elems.size()>1) { - if (!elems[1].empty()) { - threshold_ = stof(elems[1]); + void Gc::DecodeParams(const std::string &s) { + std::vector elems; + split(s, ',', std::back_inserter(elems)); + type_ = static_cast(stoi(elems[0])); + if (elems.size() > 1) { + if (!elems[1].empty()) { + threshold_ = stof(elems[1]); + } + } } - } -} -int Gc::GetCompressionFactor() { - if (type_ == GC_TWO_BIT) { - return 16; - } else { - LOG(FATAL) << "Unsupported compression type"; - return 0; - } -} + int Gc::GetCompressionFactor() { + if (type_ == GC_TWO_BIT) { + return 16; + } else { + LOG(FATAL) << "Unsupported compression type"; + return 0; + } + } -int64_t Gc::GetCompressedSize(const int64_t original_size){ - const int bits = GetCompressionFactor(); - return ((original_size % bits == 0) ? - original_size / bits : - original_size / bits + 1); -} + int64_t Gc::GetCompressedSize(const int64_t original_size) { + const int bits = GetCompressionFactor(); + return ((original_size % bits == 0) ? 
+ original_size / bits : + original_size / bits + 1); + } -struct quantize_2bit { - MSHADOW_XINLINE static void Map(int out_block_id, - int original_size, - float *out, - float *grad, - float *residual, - const float neg_threshold, - const float pos_threshold) { - float* compr_block = out + out_block_id; - // init to 0 - *compr_block = 0; - // start and end are indices in original grad array - int start = out_block_id << 4; - int end = start + 16; // <= original_size) ? start + 16 : original_size; - char* block_ptr = reinterpret_cast < char* > (compr_block); - const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; + struct quantize_2bit { + MSHADOW_XINLINE static void Map(int out_block_id, + int original_size, + float *out, + float *grad, + float *residual, + const float neg_threshold, + const float pos_threshold) { + float *compr_block = out + out_block_id; + // init to 0 + *compr_block = 0; + // start and end are indices in original grad array + int start = out_block_id << 4; + int end = start + 16; // <= original_size) ? 
start + 16 : original_size; + char *block_ptr = reinterpret_cast < char * > (compr_block); + const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; - for (int i = start; i < end && i < original_size; i++) { - // // adds 1 when i-start divisible by 4 - char* curr_byte = block_ptr + ((i-start)>>2); - residual[i] += grad[i]; - if (residual[i] >= pos_threshold) { - residual[i] -= pos_threshold; - // set data to 11 - *curr_byte |= posbits[(i & 3)]; + for (int i = start; i < end && i < original_size; i++) { + // // adds 1 when i-start divisible by 4 + char *curr_byte = block_ptr + ((i - start) >> 2); + residual[i] += grad[i]; + if (residual[i] >= pos_threshold) { + residual[i] -= pos_threshold; + // set data to 11 + *curr_byte |= posbits[(i & 3)]; // std::cout<<"pos "<< std::to_string(i&3) << " " << std::bitset<8>(*curr_byte)<(*curr_byte)<(*curr_byte)< -void Quantize2BitKernelLaunch(mshadow::Stream *s, const std::vector& inputs, const float threshold) { - mxnet::op::mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array size - inputs[0].Size(), // original size - inputs[2].dptr(), // compressed array - inputs[0].dptr(), // original array - inputs[1].dptr(), // residual array - -1 * threshold, // negative threshold - threshold); // positive threshold -} + template + void Quantize2BitKernelLaunch(mshadow::Stream *s, const std::vector &inputs, + const float threshold) { + mxnet::op::mxnet_op::Kernel::Launch(s, + inputs[2].Size(), // compressed array size + inputs[0].Size(), // original size + inputs[2].dptr(), // compressed array + inputs[0].dptr(), // original array + inputs[1].dptr(), // residual array + -1 * + threshold, // negative threshold + threshold); // positive threshold + } -void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { - Quantize2BitKernelLaunch(s, inputs, threshold); -} + void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, + const float 
threshold) { + Quantize2BitKernelLaunch(s, inputs, threshold); + } -struct dequantize_2bit { - // Decompress - MSHADOW_XINLINE static void Map(int i, - float *out, - float *in, - const float neg_threshold, - const float pos_threshold) { + void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, + const float threshold); - float* outval = out + i; - char* ch_ptr = reinterpret_cast(in + (i>>4)); + struct dequantize_2bit { + // Decompress + MSHADOW_XINLINE static void Map(int i, + float *out, + float *in, + const float neg_threshold, + const float pos_threshold) { - ch_ptr += ((i & 15) >> 2 ); - const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; - int col = i & 3; - uint8_t mask = posbits[col]; - uint8_t negmask = negbits[col]; - uint8_t masked = *ch_ptr & mask; - if ( masked == mask ) { - *outval = pos_threshold; - } // use posbits for mask as posbits are 11 - // compare with negbits - else if ( masked == negmask ) { - *outval = neg_threshold; - } else { - *outval = 0; - } - } -}; + float *outval = out + i; + char *ch_ptr = reinterpret_cast(in + (i >> 4)); -template -void Dequantize2BitKernelLaunch(mshadow::Stream *s, const std::vector& inputs, const float threshold) { + ch_ptr += ((i & 15) >> 2); + const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; + int col = i & 3; + uint8_t mask = posbits[col]; + uint8_t negmask = negbits[col]; + uint8_t masked = *ch_ptr & mask; + if (masked == mask) { + *outval = pos_threshold; + } // use posbits for mask as posbits are 11 + // compare with negbits + else if (masked == negmask) { + *outval = neg_threshold; + } else { + *outval = 0; + } + } + }; - mxnet::op::mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size - inputs[1].dptr(), // out array - inputs[0].dptr(), // compressed array - -1*threshold, // negative threshold - threshold); // positive threshold -} + template + void 
Dequantize2BitKernelLaunch(mshadow::Stream *s, const std::vector &inputs, + const float threshold) { -void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { - Dequantize2BitKernelLaunch(s, inputs, threshold); -} + mxnet::op::mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size + inputs[1].dptr(), // out array + inputs[0].dptr(), // compressed array + -1 * + threshold, // negative threshold + threshold); // positive threshold + } -void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold); -void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold); + void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, + const float threshold) { + Dequantize2BitKernelLaunch(s, inputs, threshold); + } + void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, + const float threshold); void Gc::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, mxnet::NDArray *residual, const int priority) { @@ -221,7 +230,7 @@ void Dequantize2BitImpl(mshadow::Stream* s, const std::vectorvar(), residual->var()}, mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); } else { - #if MXNET_USE_CUDA +#if MXNET_USE_CUDA if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { std::vector inputs = {from.data(), residual->data(), to->data()}; @@ -233,9 +242,9 @@ void Dequantize2BitImpl(mshadow::Stream* s, const std::vector -void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float threshold) { - Quantize2BitKernelLaunch(s, inputs, threshold); -} +namespace mxnet{ + namespace kvstore{ + void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float threshold) { + Quantize2BitKernelLaunch(s, inputs, threshold); + } -void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float 
threshold) { - Dequantize2BitKernelLaunch(s, inputs, threshold); + void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float threshold) { + Dequantize2BitKernelLaunch(s, inputs, threshold); + } + } } \ No newline at end of file From f19e7ee47063c3448698fac19186bdda61991c85 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 8 Nov 2017 15:04:43 -0800 Subject: [PATCH 197/237] compiles and links on both cpu and gpu --- src/kvstore/gc-inl.h | 101 +++++++++++++++++++++++++++++++++++++++++++ src/kvstore/gc.cc | 96 +--------------------------------------- src/kvstore/gc.cu | 1 + 3 files changed, 103 insertions(+), 95 deletions(-) create mode 100644 src/kvstore/gc-inl.h diff --git a/src/kvstore/gc-inl.h b/src/kvstore/gc-inl.h new file mode 100644 index 000000000000..0df5782da562 --- /dev/null +++ b/src/kvstore/gc-inl.h @@ -0,0 +1,101 @@ +#include "../operator/mxnet_op.h" +namespace mxnet { + namespace kvstore{ + struct quantize_2bit { + MSHADOW_XINLINE static void Map(int out_block_id, + int original_size, + float *out, + float *grad, + float *residual, + const float neg_threshold, + const float pos_threshold) { + float *compr_block = out + out_block_id; + // init to 0 + *compr_block = 0; + // start and end are indices in original grad array + int start = out_block_id << 4; + int end = start + 16; // <= original_size) ? 
start + 16 : original_size; + char *block_ptr = reinterpret_cast < char * > (compr_block); + const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; + + for (int i = start; i < end && i < original_size; i++) { + // // adds 1 when i-start divisible by 4 + char *curr_byte = block_ptr + ((i - start) >> 2); + residual[i] += grad[i]; + if (residual[i] >= pos_threshold) { + residual[i] -= pos_threshold; + // set data to 11 + *curr_byte |= posbits[(i & 3)]; +// std::cout<<"pos "<< std::to_string(i&3) << " " << std::bitset<8>(*curr_byte)<(*curr_byte)<(*curr_byte)< + void Quantize2BitKernelLaunch(mshadow::Stream *s, const std::vector &inputs, + const float threshold) { + mxnet::op::mxnet_op::Kernel::Launch(s, + inputs[2].Size(), // compressed array size + inputs[0].Size(), // original size + inputs[2].dptr(), // compressed array + inputs[0].dptr(), // original array + inputs[1].dptr(), // residual array + -1 * + threshold, // negative threshold + threshold); // positive threshold + } + + struct dequantize_2bit { + // Decompress + MSHADOW_XINLINE static void Map(int i, + float *out, + float *in, + const float neg_threshold, + const float pos_threshold) { + + float *outval = out + i; + char *ch_ptr = reinterpret_cast(in + (i >> 4)); + + ch_ptr += ((i & 15) >> 2); + const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; + int col = i & 3; + uint8_t mask = posbits[col]; + uint8_t negmask = negbits[col]; + uint8_t masked = *ch_ptr & mask; + if (masked == mask) { + *outval = pos_threshold; + } // use posbits for mask as posbits are 11 + // compare with negbits + else if (masked == negmask) { + *outval = neg_threshold; + } else { + *outval = 0; + } + } + }; + + template + void Dequantize2BitKernelLaunch(mshadow::Stream *s, const std::vector &inputs, + const float threshold) { + + mxnet::op::mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size + inputs[1].dptr(), // out 
array + inputs[0].dptr(), // compressed array + -1 * + threshold, // negative threshold + threshold); // positive threshold + } + + + } +} diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc index 787789e56204..1937930a05a7 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gc.cc @@ -24,6 +24,7 @@ */ #include #include "../operator/mxnet_op.h" +#include "./gc-inl.h" namespace mxnet { namespace kvstore { @@ -103,59 +104,6 @@ namespace mxnet { original_size / bits + 1); } - struct quantize_2bit { - MSHADOW_XINLINE static void Map(int out_block_id, - int original_size, - float *out, - float *grad, - float *residual, - const float neg_threshold, - const float pos_threshold) { - float *compr_block = out + out_block_id; - // init to 0 - *compr_block = 0; - // start and end are indices in original grad array - int start = out_block_id << 4; - int end = start + 16; // <= original_size) ? start + 16 : original_size; - char *block_ptr = reinterpret_cast < char * > (compr_block); - const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; - - for (int i = start; i < end && i < original_size; i++) { - // // adds 1 when i-start divisible by 4 - char *curr_byte = block_ptr + ((i - start) >> 2); - residual[i] += grad[i]; - if (residual[i] >= pos_threshold) { - residual[i] -= pos_threshold; - // set data to 11 - *curr_byte |= posbits[(i & 3)]; -// std::cout<<"pos "<< std::to_string(i&3) << " " << std::bitset<8>(*curr_byte)<(*curr_byte)<(*curr_byte)< - void Quantize2BitKernelLaunch(mshadow::Stream *s, const std::vector &inputs, - const float threshold) { - mxnet::op::mxnet_op::Kernel::Launch(s, - inputs[2].Size(), // compressed array size - inputs[0].Size(), // original size - inputs[2].dptr(), // compressed array - inputs[0].dptr(), // original array - inputs[1].dptr(), // residual array - -1 * - threshold, // negative threshold - threshold); // positive threshold - } - void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, 
const float threshold) { Quantize2BitKernelLaunch(s, inputs, threshold); @@ -164,48 +112,6 @@ namespace mxnet { void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, const float threshold); - struct dequantize_2bit { - // Decompress - MSHADOW_XINLINE static void Map(int i, - float *out, - float *in, - const float neg_threshold, - const float pos_threshold) { - - float *outval = out + i; - char *ch_ptr = reinterpret_cast(in + (i >> 4)); - - ch_ptr += ((i & 15) >> 2); - const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; - int col = i & 3; - uint8_t mask = posbits[col]; - uint8_t negmask = negbits[col]; - uint8_t masked = *ch_ptr & mask; - if (masked == mask) { - *outval = pos_threshold; - } // use posbits for mask as posbits are 11 - // compare with negbits - else if (masked == negmask) { - *outval = neg_threshold; - } else { - *outval = 0; - } - } - }; - - template - void Dequantize2BitKernelLaunch(mshadow::Stream *s, const std::vector &inputs, - const float threshold) { - - mxnet::op::mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size - inputs[1].dptr(), // out array - inputs[0].dptr(), // compressed array - -1 * - threshold, // negative threshold - threshold); // positive threshold - } - void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, const float threshold) { Dequantize2BitKernelLaunch(s, inputs, threshold); diff --git a/src/kvstore/gc.cu b/src/kvstore/gc.cu index c3b6a0b73a82..b3586776aacd 100644 --- a/src/kvstore/gc.cu +++ b/src/kvstore/gc.cu @@ -23,6 +23,7 @@ */ #include +#include "./gc-inl.h" namespace mxnet{ namespace kvstore{ From 42cdbdf3ad64ba5c37ce3389943e62c1eeb65231 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 8 Nov 2017 15:40:06 -0800 Subject: [PATCH 198/237] move prototypes to header --- include/mxnet/gc.h | 66 +++++++++++++++++++++----------------------- src/kvstore/gc-inl.h | 26 ++++++++++++++++- src/kvstore/gc.cc | 29 
++++++++----------- src/kvstore/gc.cu | 1 + 4 files changed, 69 insertions(+), 53 deletions(-) diff --git a/include/mxnet/gc.h b/include/mxnet/gc.h index 49ef153c7e1c..9af6e7e1ddb4 100644 --- a/include/mxnet/gc.h +++ b/include/mxnet/gc.h @@ -34,58 +34,56 @@ namespace mxnet { namespace kvstore { -enum CompressionType { - GC_NONE, GC_TWO_BIT -}; + enum CompressionType { + GC_NONE, GC_TWO_BIT + }; -class Gc { - public: - Gc(); + class Gc { + public: + Gc(); - virtual ~Gc() {} + virtual ~Gc() {} - void SetParams(const std::string& compression_type, const float threshold); + void SetParams(const std::string &compression_type, const float threshold); - void set_active(); + void set_active(); - bool get_active(); + bool get_active(); - bool get_active_type(); + bool get_active_type(); - void SetTwoBitCompression(const float threshold); + void SetTwoBitCompression(const float threshold); - std::string EncodeParams(); + std::string EncodeParams(); - void DecodeParams(const std::string& s); + void DecodeParams(const std::string &s); - int GetCompressionFactor(); + int GetCompressionFactor(); - int64_t GetCompressedSize(const int64_t original_size); + int64_t GetCompressedSize(const int64_t original_size); - void Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, - mxnet::NDArray *residual, const int priority); + void Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, + mxnet::NDArray *residual, const int priority); - void Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority); + void Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority); -private: -// void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold); -// void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold); -// -// void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold); -// void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, 
const float threshold); -// -// template -// void Quantize2BitKernelLaunch(mshadow::Stream *s, const std::vector& inputs, const float threshold); -// template -// void Dequantize2BitKernelLaunch(mshadow::Stream *s, const std::vector& inputs, const float threshold); + private: + CompressionType type_; - CompressionType type_; + bool active_; - bool active_; + float threshold_ = 0; - float threshold_ = 0; + }; -}; + void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, + const float threshold); + void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, + const float threshold); + void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, + const float threshold); + void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, + const float threshold); } } diff --git a/src/kvstore/gc-inl.h b/src/kvstore/gc-inl.h index 0df5782da562..2c19b7e38288 100644 --- a/src/kvstore/gc-inl.h +++ b/src/kvstore/gc-inl.h @@ -1,3 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file gc.cu + * \author Rahul Huilgol + * \brief + */ + #include "../operator/mxnet_op.h" namespace mxnet { namespace kvstore{ @@ -96,6 +121,5 @@ namespace mxnet { threshold); // positive threshold } - } } diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc index 1937930a05a7..95745193fecc 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gc.cc @@ -23,13 +23,12 @@ * \author Rahul Huilgol */ #include -#include "../operator/mxnet_op.h" #include "./gc-inl.h" namespace mxnet { namespace kvstore { -// TODO check if it returns empty between two delims + // TODO check if it returns empty between two delims like (a,,b) template void split(const std::string &s, const char delim, Out result) { std::stringstream ss; @@ -104,22 +103,6 @@ namespace mxnet { original_size / bits + 1); } - void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, - const float threshold) { - Quantize2BitKernelLaunch(s, inputs, threshold); - } - - void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, - const float threshold); - - void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, - const float threshold) { - Dequantize2BitKernelLaunch(s, inputs, threshold); - } - - void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, - const float threshold); - void Gc::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, mxnet::NDArray *residual, const int priority) { CHECK(from.shape().ndim() != 0) @@ -193,6 +176,16 @@ namespace mxnet { } } + void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, + const float threshold) { + Quantize2BitKernelLaunch(s, inputs, threshold); + } + + void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, + const float threshold) { + Dequantize2BitKernelLaunch(s, inputs, threshold); + } + } } diff --git a/src/kvstore/gc.cu b/src/kvstore/gc.cu index b3586776aacd..f2cc2d8548ba 100644 --- a/src/kvstore/gc.cu +++ b/src/kvstore/gc.cu @@ -19,6 +19,7 @@ /*! 
* \file gc.cu + * \author Rahul Huilgol * \brief */ From 82f796422f70a282dc2e1354896c81255508339b Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 8 Nov 2017 16:42:57 -0800 Subject: [PATCH 199/237] add split function --- include/mxnet/gc.h | 166 ++++++++++++++++---------- src/kvstore/gc-inl.h | 200 +++++++++++++++++-------------- src/kvstore/gc.cc | 276 +++++++++++++++++++++---------------------- src/kvstore/gc.cu | 25 ++-- 4 files changed, 362 insertions(+), 305 deletions(-) diff --git a/include/mxnet/gc.h b/include/mxnet/gc.h index 9af6e7e1ddb4..bced8a91d2a2 100644 --- a/include/mxnet/gc.h +++ b/include/mxnet/gc.h @@ -26,65 +26,111 @@ #ifndef MXNET_KVSTORE_GC_H #define MXNET_KVSTORE_GC_H #include -#include -#include #include"./ndarray.h" -#include "./base.h" -#include -namespace mxnet { - namespace kvstore { - - enum CompressionType { - GC_NONE, GC_TWO_BIT - }; - - class Gc { - public: - Gc(); - - virtual ~Gc() {} - - void SetParams(const std::string &compression_type, const float threshold); - - void set_active(); - - bool get_active(); - - bool get_active_type(); - - void SetTwoBitCompression(const float threshold); - - std::string EncodeParams(); - - void DecodeParams(const std::string &s); - int GetCompressionFactor(); - - int64_t GetCompressedSize(const int64_t original_size); - - void Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, - mxnet::NDArray *residual, const int priority); - - void Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority); - - private: - CompressionType type_; - - bool active_; - - float threshold_ = 0; - - }; - - void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, - const float threshold); - void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, - const float threshold); - void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, - const float threshold); - void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, - const float threshold); 
- } -} - -#endif //MXNET_KVSTORE_GC_H +namespace mxnet { +namespace kvstore { + +enum CompressionType { + GC_NONE, GC_TWO_BIT +}; + +class Gc { +public: + Gc(); + + virtual ~Gc() {} + + /*! + * \brief sets parameters for gradient compression + * \param compression_type str representing types like 2bit + * \param threshold float value used for thresholding gradients + */ + void SetParams(const std::string &compression_type, const float threshold); + + /*! + * \brief sets gradient compression to active mode + * Active mode is when gradients are compressed + * Gc is in inactive mode during init of parameters + */ + void set_active(); + + /*! + * \brief returns boolean whether or not gc is in active mode + */ + bool get_active(); + + /*! + * \brief if gc is in active mode, returns type of compression set + * else returns GC_NONE + */ + bool get_active_type(); + + void SetTwoBitCompression(const float threshold); + + /*! + * \brief encodes parameters of gc into a string + */ + std::string EncodeParams(); + + /*! + * \brief decodes parameters of gc from a string and assigns them to member variables + */ + void DecodeParams(const std::string &s); + + /*! + * \brief returns compression factor, which is the factor by which size of gradient + * reduces when using a particular type of compression + */ + int GetCompressionFactor(); + + /*! + * \brief returns the size of compressed gradients given an original sized gradient array + */ + int64_t GetCompressedSize(const int64_t original_size); + + /*! + * \brief Issues quantize operation to be scheduled by the engine + * Compresses `from` into `to` and accumulates the quantization error + * into 'residual', using the quantization of type `type_` + * \param from the ndarray containing original data to be quantized + * \param to the target ndarray which contains quantized data + * \param residual the ndarray which accumulates quantization error + * \param priority Priority of the action. 
+ */ + void Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, + mxnet::NDArray *residual, const int priority); + + /*! + * \brief Issues dequantize operation to be scheduled by the engine + * Decompresses `from` into `to` using current parameters of `type` and `threshold` + * \param from the ndarray containing quantized data + * \param to the target ndarray which contains final dequantized data + * \param priority Priority of the action. + */ + void Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority); + +private: + /*! + * \brief denotes the type of gradient compression which has been set + */ + CompressionType type_; + + /*! + * \brief denotes whether gradient compression is active + * Value starts with false because we don't want initialization of parameters to be compressed. + * That would lead to bad convergence results. Currently after initialization, gc becomes active. + */ + bool active_; + + /*! + * \brief denotes threshold used for quantization and dequantization + * Must be a positive value. All positive gradients will be thresholded to `threshold_` and + * all negative gradients will be thresholded to -1*`threshold_` + */ + float threshold_ = 0; + +}; +} // namespace kvstore +} // namespace mxnet +#endif // MXNET_KVSTORE_GC_H diff --git a/src/kvstore/gc-inl.h b/src/kvstore/gc-inl.h index 2c19b7e38288..1cabf1272e86 100644 --- a/src/kvstore/gc-inl.h +++ b/src/kvstore/gc-inl.h @@ -20,106 +20,126 @@ /*! 
* \file gc.cu * \author Rahul Huilgol - * \brief + * \brief Declares and defines functions used to quantize and dequantize data */ #include "../operator/mxnet_op.h" + namespace mxnet { - namespace kvstore{ - struct quantize_2bit { - MSHADOW_XINLINE static void Map(int out_block_id, - int original_size, - float *out, - float *grad, - float *residual, - const float neg_threshold, - const float pos_threshold) { - float *compr_block = out + out_block_id; - // init to 0 - *compr_block = 0; - // start and end are indices in original grad array - int start = out_block_id << 4; - int end = start + 16; // <= original_size) ? start + 16 : original_size; - char *block_ptr = reinterpret_cast < char * > (compr_block); - const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; +namespace kvstore{ - for (int i = start; i < end && i < original_size; i++) { - // // adds 1 when i-start divisible by 4 - char *curr_byte = block_ptr + ((i - start) >> 2); - residual[i] += grad[i]; - if (residual[i] >= pos_threshold) { - residual[i] -= pos_threshold; - // set data to 11 - *curr_byte |= posbits[(i & 3)]; -// std::cout<<"pos "<< std::to_string(i&3) << " " << std::bitset<8>(*curr_byte)<(*curr_byte)<(*curr_byte)< *s, const std::vector &inputs, + const float threshold); +void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, + const float threshold); - template - void Quantize2BitKernelLaunch(mshadow::Stream *s, const std::vector &inputs, - const float threshold) { - mxnet::op::mxnet_op::Kernel::Launch(s, - inputs[2].Size(), // compressed array size - inputs[0].Size(), // original size - inputs[2].dptr(), // compressed array - inputs[0].dptr(), // original array - inputs[1].dptr(), // residual array - -1 * - threshold, // negative threshold - threshold); // positive threshold +struct quantize_2bit { + MSHADOW_XINLINE static void Map(int out_block_id, + int original_size, + float *out, + float *grad, + float *residual, + const 
float neg_threshold, + const float pos_threshold) { + // this block contains the compressed representation of upto 16 values starting from out_block_id*16 + float *compr_block = out + out_block_id; + // init to 0 + *compr_block = 0; + // start and end are indices in original grad array + int start = out_block_id << 4; + int end = (start + 16 <= original_size) ? start + 16 : original_size; + // cast as char* to manipulate bits of float addresses + char *block_ptr = reinterpret_cast < char * > (compr_block); + // masks to set bits when value meets pos_threshold + // 0xc0 is mask when value is to be represented by the first two bits in a char* + // 0xc0 means first two bits are set to 11 + const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + // masks to set bits when value meets neg_threshold + const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; + for (int i = start; i < end; i++) { + // adds offset to reach appropriate byte + char *curr_byte = block_ptr + ((i - start) >> 2); + // adds gradient to existing residual to get updated grad + residual[i] += grad[i]; + if (residual[i] >= pos_threshold) { + // set data to 11 + *curr_byte |= posbits[(i & 3)]; + // reduce residual by pos_threshold + residual[i] -= pos_threshold; + } else if (residual[i] <= neg_threshold) { + // set data to 10 + *curr_byte |= negbits[(i & 3)]; + residual[i] -= neg_threshold; + } } + } +}; - struct dequantize_2bit { - // Decompress - MSHADOW_XINLINE static void Map(int i, - float *out, - float *in, - const float neg_threshold, - const float pos_threshold) { - - float *outval = out + i; - char *ch_ptr = reinterpret_cast(in + (i >> 4)); +template +void Quantize2BitKernelLaunch(mshadow::Stream *s, const std::vector &inputs, + const float threshold) { + mxnet::op::mxnet_op::Kernel::Launch(s, + inputs[2].Size(), // compressed array size + inputs[0].Size(), // original size + inputs[2].dptr(), // compressed array + inputs[0].dptr(), // original array + inputs[1].dptr(), // residual array + -1 
*threshold, // negative threshold + threshold); // positive threshold +} - ch_ptr += ((i & 15) >> 2); - const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; - int col = i & 3; - uint8_t mask = posbits[col]; - uint8_t negmask = negbits[col]; - uint8_t masked = *ch_ptr & mask; - if (masked == mask) { - *outval = pos_threshold; - } // use posbits for mask as posbits are 11 - // compare with negbits - else if (masked == negmask) { - *outval = neg_threshold; - } else { - *outval = 0; - } - } - }; +struct dequantize_2bit { + MSHADOW_XINLINE static void Map(int i, + float *out, + float *in, + const float neg_threshold, + const float pos_threshold) { + // get position of dequantized value to fill + float *outval = out + i; + // gets byte which holds quantized value for this position + char *ch_ptr = reinterpret_cast(in + (i >> 4)); + ch_ptr += ((i & 15) >> 2); + // masks used to quantize data + const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; + const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; + // col denotes which two bits of a byte are set for this value + // col=0 implies first two bits, col=3 implies last two bits,... 
+ int col = i & 3; + uint8_t mask = posbits[col]; + uint8_t negmask = negbits[col]; + uint8_t masked = *ch_ptr & mask; + if (masked == mask) { + *outval = pos_threshold; + } else if (masked == negmask) { + // use posbits for mask as posbits are both 1s + // then compare masked with negbits to see if only negbits were set + *outval = neg_threshold; + } else { + *outval = 0; + } + } +}; - template - void Dequantize2BitKernelLaunch(mshadow::Stream *s, const std::vector &inputs, - const float threshold) { +template +void Dequantize2BitKernelLaunch(mshadow::Stream *s, const std::vector &inputs, + const float threshold) { + mxnet::op::mxnet_op::Kernel::Launch(s, + inputs[1].Size(), // original size + inputs[1].dptr(), // out array + inputs[0].dptr(), // compressed array + -1 *threshold, // negative threshold + threshold); // positive threshold +} - mxnet::op::mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size - inputs[1].dptr(), // out array - inputs[0].dptr(), // compressed array - -1 * - threshold, // negative threshold - threshold); // positive threshold - } +inline void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, + const float threshold) { + Quantize2BitKernelLaunch(s, inputs, threshold); +} - } +inline void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, + const float threshold) { + Dequantize2BitKernelLaunch(s, inputs, threshold); } +} // namespace kvstore +} // namespace mxnet diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc index 95745193fecc..a64ce4ab7258 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gc.cc @@ -23,169 +23,163 @@ * \author Rahul Huilgol */ #include +#include +#include #include "./gc-inl.h" namespace mxnet { - namespace kvstore { - - // TODO check if it returns empty between two delims like (a,,b) - template - void split(const std::string &s, const char delim, Out result) { - std::stringstream ss; - ss.str(s); - std::string item; - while (std::getline(ss, item, delim)) { - *(result++) = item; - 
} - } +namespace kvstore { - Gc::Gc() { - type_ = GC_NONE; - active_ = false; - } +/*! + * \brief Splits a string into smaller strings using char as delimiter + * Example: "a,b,c,,d" is split into ["a","b","c","","d"] + * \param s string to split + * \param delim char to split string around + * \param result container for tokens extracted after splitting + */ +template +void split(const std::string &s, const char delim, Out result) { + std::stringstream ss; + ss.str(s); + std::string item; + while (std::getline(ss, item, delim)) { + *(result++) = item; + } +} - void Gc::SetParams(const std::string &compression_type, const float threshold) { - if (compression_type == "2bit") { - SetTwoBitCompression(threshold); - } - } +Gc::Gc() { + type_ = GC_NONE; + active_ = false; +} - void Gc::set_active() { - active_ = true; - } +void Gc::SetParams(const std::string &compression_type, const float threshold) { + if (compression_type == "2bit") { + SetTwoBitCompression(threshold); + } +} - bool Gc::get_active() { - return active_; - } +void Gc::set_active() { + active_ = true; +} - bool Gc::get_active_type() { - if (active_) return type_; - else return GC_NONE; - } +bool Gc::get_active() { + return active_; +} - void Gc::SetTwoBitCompression(const float threshold) { - type_ = GC_TWO_BIT; - threshold_ = threshold; - } +bool Gc::get_active_type() { + if (active_) return type_; + else return GC_NONE; +} - std::string Gc::EncodeParams() { - std::string rval = std::to_string(type_); - if (type_ == GC_TWO_BIT) { - rval += "," + std::to_string(threshold_); - } - return rval; - } +void Gc::SetTwoBitCompression(const float threshold) { + type_ = GC_TWO_BIT; + threshold_ = threshold; +} - void Gc::DecodeParams(const std::string &s) { - std::vector elems; - split(s, ',', std::back_inserter(elems)); - type_ = static_cast(stoi(elems[0])); - if (elems.size() > 1) { - if (!elems[1].empty()) { - threshold_ = stof(elems[1]); - } - } - } +std::string Gc::EncodeParams() { + std::string rval = 
std::to_string(type_); + if (type_ == GC_TWO_BIT) { + rval += "," + std::to_string(threshold_); + } + return rval; +} - int Gc::GetCompressionFactor() { - if (type_ == GC_TWO_BIT) { - return 16; - } else { - LOG(FATAL) << "Unsupported compression type"; - return 0; - } +void Gc::DecodeParams(const std::string &s) { + std::vector elems; + split(s, ',', std::back_inserter(elems)); + type_ = static_cast(stoi(elems[0])); + if (elems.size() > 1) { + if (!elems[1].empty()) { + threshold_ = stof(elems[1]); } + } +} - int64_t Gc::GetCompressedSize(const int64_t original_size) { - const int bits = GetCompressionFactor(); - return ((original_size % bits == 0) ? - original_size / bits : - original_size / bits + 1); - } +int Gc::GetCompressionFactor() { + if (type_ == GC_TWO_BIT) { + return 16; + } else { + LOG(FATAL) << "Unsupported compression type"; + return 0; + } +} - void Gc::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, - mxnet::NDArray *residual, const int priority) { - CHECK(from.shape().ndim() != 0) - << "source operands have zero dimension shape"; - // important: callback must always capture by value - int a = from.ctx().dev_mask(); - int b = to->ctx().dev_mask(); - const float threshold = threshold_; - if (type_ == GC_TWO_BIT) { - if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), residual->data(), to->data()}; - Quantize2BitImpl(ctx.get_stream(), inputs, threshold); - }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); - } else { +int64_t Gc::GetCompressedSize(const int64_t original_size) { + const int bits = GetCompressionFactor(); + return ((original_size % bits == 0) ? 
+ original_size / bits : + original_size / bits + 1); +} + +void Gc::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, + mxnet::NDArray *residual, const int priority) { + CHECK(from.shape().ndim() != 0) << "source operands have zero dimension shape"; + int a = from.ctx().dev_mask(); + int b = to->ctx().dev_mask(); + const float threshold = threshold_; + if (type_ == GC_TWO_BIT) { + if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), residual->data(), to->data()}; + Quantize2BitImpl(ctx.get_stream(), inputs, threshold); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); + } else { #if MXNET_USE_CUDA - if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), residual->data(), to->data()}; - Quantize2BitImpl(ctx.get_stream(), inputs, threshold); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); - } else { - LOG(FATAL) << "unknown device mask"; - } + if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), residual->data(), to->data()}; + Quantize2BitImpl(ctx.get_stream(), inputs, threshold); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); + } else { + LOG(FATAL) << "unknown device mask"; + } #else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; 
#endif - } - } else { - LOG(FATAL) << "Unsupported quantization of type " << type_; - } - } - - void Gc::Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority) { - CHECK(from.shape().ndim() != 0) - << "source operands have zero dimension shape"; - // important: callback must always capture by value - const int a = from.ctx().dev_mask(); - const int b = to->ctx().dev_mask(); - const float threshold = threshold_; - if (type_ == GC_TWO_BIT) { - if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), to->data()}; - Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); - }, from.ctx(), {from.var()}, {to->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); - } else { - #if MXNET_USE_CUDA - if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), to->data()}; - Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, from.ctx(), {from.var()}, {to->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); - } else { - LOG(FATAL) << "unknown device mask"; - } - #else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - #endif - } - } else { - LOG(FATAL) << "Unsupported dequantization of type " << type_; - } } + } else { + LOG(FATAL) << "Unsupported quantization of type " << type_; + } +} - void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, - const float threshold) { - Quantize2BitKernelLaunch(s, inputs, threshold); +void Gc::Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority) { + CHECK(from.shape().ndim() != 0) << "source operands have zero dimension shape"; + const int a = from.ctx().dev_mask(); + const int b = to->ctx().dev_mask(); + 
const float threshold = threshold_; + if (type_ == GC_TWO_BIT) { + if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), to->data()}; + Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); + }, from.ctx(), {from.var()}, {to->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); + } else { +#if MXNET_USE_CUDA + if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { + mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), to->data()}; + Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {to->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); + } else { + LOG(FATAL) << "unknown device mask"; } - - void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, - const float threshold) { - Dequantize2BitKernelLaunch(s, inputs, threshold); +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif } - + } else { + LOG(FATAL) << "Unsupported dequantization of type " << type_; } } +} // namespace kvstore +} // namespace mxnet + diff --git a/src/kvstore/gc.cu b/src/kvstore/gc.cu index f2cc2d8548ba..9279e6a1f176 100644 --- a/src/kvstore/gc.cu +++ b/src/kvstore/gc.cu @@ -20,22 +20,19 @@ /*! 
* \file gc.cu * \author Rahul Huilgol - * \brief + * \brief Implementation for gpu version of code */ -#include #include "./gc-inl.h" -namespace mxnet{ - namespace kvstore{ - void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float threshold) { - Quantize2BitKernelLaunch(s, inputs, threshold); - } +namespace mxnet { +namespace kvstore { +void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { + Quantize2BitKernelLaunch(s, inputs, threshold); +} - void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float threshold) { - Dequantize2BitKernelLaunch(s, inputs, threshold); - } - } -} \ No newline at end of file +void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { + Dequantize2BitKernelLaunch(s, inputs, threshold); +} +} // namespace kvstore +} // namespace mxnet \ No newline at end of file From b96c3c0cf0e42fc989c69db6df755bec440c7bf0 Mon Sep 17 00:00:00 2001 From: Rahul Date: Wed, 8 Nov 2017 16:49:45 -0800 Subject: [PATCH 200/237] undo changes from master --- CMakeLists.txt | 2 +- nnvm | 2 +- omp_test.cpp | 66 -- src/io/image_io.cc | 2 +- src/ndarray/ndarray.cc | 2 + src/ndarray/ndarray_function.cc | 1 - src/ndarray/ndarray_function.h | 3 +- src/operator/contrib/two_bit_quantize-inl.h | 144 ---- src/operator/contrib/two_bit_quantize.cc | 30 - src/operator/contrib/two_bit_quantize.cu | 41 - tests/cpp/include/test_core_op.h | 111 --- tests/cpp/include/test_legacy_op.h | 51 -- tests/cpp/include/test_op.h | 546 ++---------- tests/cpp/include/test_op_runner.h | 17 - tests/cpp/include/test_perf.h | 19 + tests/cpp/include/test_util.h | 177 ++-- tests/cpp/operator/activation_perf.cc | 15 - tests/cpp/operator/batchnorm_test.cc | 912 +++++++++++--------- tests/cpp/operator/core_op_runner_test.cc | 85 -- tests/cpp/operator/coreop_perf.cc | 20 - tests/cpp/operator/fully_conn_perf.cc | 12 - tests/cpp/test_main.cc | 9 +- 22 files changed, 692 
insertions(+), 1575 deletions(-) delete mode 100644 omp_test.cpp delete mode 100644 src/operator/contrib/two_bit_quantize-inl.h delete mode 100644 src/operator/contrib/two_bit_quantize.cc delete mode 100644 src/operator/contrib/two_bit_quantize.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 260da9923275..539515b3a25f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -440,7 +440,7 @@ else() target_link_libraries(mxnet PRIVATE "-Wl,--whole-archive $ -Wl,--no-whole-archive") target_link_libraries(mxnet PRIVATE mxnet_static) # Let cmake understand the dependency else() - add_library(mxnet SHARED ${SOURCE} src/kvstore/gc.cpp src/kvstore/gc.h) + add_library(mxnet SHARED ${SOURCE}) endif() endif() diff --git a/nnvm b/nnvm index c86afa8f17a4..8d79cfd0b42f 160000 --- a/nnvm +++ b/nnvm @@ -1 +1 @@ -Subproject commit c86afa8f17a44bcd4e6eec41cd49ba87e4f7a635 +Subproject commit 8d79cfd0b42fbe9f6ad75886d495065d5500b9dd diff --git a/omp_test.cpp b/omp_test.cpp deleted file mode 100644 index e4657ce3c345..000000000000 --- a/omp_test.cpp +++ /dev/null @@ -1,66 +0,0 @@ -#include -#include -#include "omp.h" -#include - -void quantize_2bit(float* data, float* res, float* compr, long long int size){ - // const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - // const int negbits[] = {0x80, 0x20, 0x08, 0x02}; - #pragma omp parallel for - for(long long int i=0; i>4; i++) { - float* compr_block = compr + i; - *compr_block = 0; - int s=i<<4, e=s+16; - char* block_ptr = reinterpret_cast(compr_block); - const int posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const int negbits[] = {0x80, 0x20, 0x08, 0x02}; - // char* curr_byte = block_ptr; - for(int i=s; i= 0.5) { - res[i] -= 0.5; - *block_ptr |= posbits[i&3]; - } - else if(res[i] <= -0.5) { - res[i] += 0.5; - *block_ptr |= negbits[i&3]; - } - } - } -} - - - -int main() { - std::cout<<"openmp max threads are "<>4;i++){ - compr[i] = 0; - for(int j = i; j(t2-t1).count(); - std::cout << "time for " <(t4-t3).count(); - std::cout<< "time for 
quantizing "<PushSync([ndout, buff, fsize, param](RunContext ctx){ ImdecodeImpl(param.flag, param.to_rgb, buff, fsize, const_cast(&ndout)); - delete[] buff; + delete buff; }, ndout.ctx(), {}, {ndout.var()}, FnProperty::kNormal, 0, PROFILER_MESSAGE("Imread")); #else diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index bfba5460dbc3..275cf4038071 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -30,6 +30,7 @@ #include #include #include +#include "./ndarray_function.h" #include "../common/utils.h" #include "../operator/tensor/matrix_op-inl.h" #include "../operator/tensor/init_op.h" @@ -557,6 +558,7 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) { } } + void CopyFromTo(const NDArray& from, NDArray *to, int priority) { CopyFromTo(from, *to, priority); } diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index 810cf0e793c1..ef0adbe5f289 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -23,7 +23,6 @@ */ // this will be invoked by gcc and compile CPU version -#include #include "./ndarray_function.h" #include "./ndarray_function-inl.h" #include "../common/utils.h" diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index 404684a5878f..f7da24601485 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -23,12 +23,13 @@ */ #ifndef MXNET_NDARRAY_NDARRAY_FUNCTION_H_ #define MXNET_NDARRAY_NDARRAY_FUNCTION_H_ -#include + #include #include #include #include #include +#include #include "../operator/mshadow_op.h" namespace mxnet { diff --git a/src/operator/contrib/two_bit_quantize-inl.h b/src/operator/contrib/two_bit_quantize-inl.h deleted file mode 100644 index a77bf8e85671..000000000000 --- a/src/operator/contrib/two_bit_quantize-inl.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file two_bit_quantize-inl.h - * \brief implementation of quantize_2bit operation - * \author Chao Ma, Rahul Huilgol - */ - -#ifndef MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ -#define MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../mxnet_op.h" - -namespace mxnet { -namespace op { - -struct quantize_2bit { - MSHADOW_XINLINE static void Map(int out_block_id, - int original_size, - float *out, - float *grad, - float *residual, - const float neg_threshold, - const float pos_threshold) { - float* compr_block = out + out_block_id; - // init to 0 - *compr_block = 0; - // start and end are indices in original grad array - int start = out_block_id << 4; - int end = start + 16; // <= original_size) ? 
start + 16 : original_size; - char* block_ptr = reinterpret_cast < char* > (compr_block); - const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; - - for (int i = start; i < end && i < original_size; i++) { - // // adds 1 when i-start divisible by 4 - char* curr_byte = block_ptr + ((i-start)>>2); - residual[i] += grad[i]; - if (residual[i] >= pos_threshold) { - residual[i] -= pos_threshold; - // set data to 11 - *curr_byte |= posbits[(i & 3)]; -// std::cout<<"pos "<< std::to_string(i&3) << " " << std::bitset<8>(*curr_byte)<(*curr_byte)<(*curr_byte)< -void Quantize2BitKernelLaunch(mshadow::Stream *s, const std::vector& inputs, const float threshold) { - mxnet_op::Kernel::Launch(s, inputs[2].Size(), // compressed array size - inputs[0].Size(), // original size - inputs[2].dptr(), // compressed array - inputs[0].dptr(), // original array - inputs[1].dptr(), // residual array - -1 * threshold, // negative threshold - threshold); // positive threshold -} - -inline void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { - Quantize2BitKernelLaunch(s, inputs, threshold); -} - -void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold); - -struct dequantize_2bit { - // Decompress - MSHADOW_XINLINE static void Map(int i, - float *out, - float *in, - const float neg_threshold, - const float pos_threshold) { - - float* outval = out + i; - char* ch_ptr = reinterpret_cast(in + (i>>4)); - - ch_ptr += ((i & 15) >> 2 ); - const uint8_t posbits[] = {0xc0, 0x30, 0x0c, 0x03}; - const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; - int col = i & 3; - uint8_t mask = posbits[col]; - uint8_t negmask = negbits[col]; - uint8_t masked = *ch_ptr & mask; - if ( masked == mask ) { - *outval = pos_threshold; - } // use posbits for mask as posbits are 11 - // compare with negbits - else if ( masked == negmask ) { - *outval = neg_threshold; - } else { - *outval = 0; - } - } 
-}; - - -template -void Dequantize2BitKernelLaunch(mshadow::Stream *s, const std::vector& inputs, const float threshold) { - mxnet::op::mxnet_op::Kernel::Launch(s, inputs[1].Size(), // original size - inputs[1].dptr(), // out array - inputs[0].dptr(), // compressed array - -1*threshold, // negative threshold - threshold); // positive threshold -} - -inline void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { - Dequantize2BitKernelLaunch(s, inputs, threshold); -} - -void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float threshold); - -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_CONTRIB_TWO_BIT_QUANTIZE_INL_H_ diff --git a/src/operator/contrib/two_bit_quantize.cc b/src/operator/contrib/two_bit_quantize.cc deleted file mode 100644 index 0e9e060bbbf1..000000000000 --- a/src/operator/contrib/two_bit_quantize.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file two_bit_quantize.cc - * \brief registers quantize_2bit, dequantize_2bit - * and create_2bit operators with nnvm - */ - -namespace mxnet { -namespace op { - -} // namespace op -} // namespace mxnet diff --git a/src/operator/contrib/two_bit_quantize.cu b/src/operator/contrib/two_bit_quantize.cu deleted file mode 100644 index 087cc9102a7c..000000000000 --- a/src/operator/contrib/two_bit_quantize.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file two_bit_quantize_sim.cu - * \brief registers quantize_2bit, dequantize_2bit - * and create_2bit operators for GPU - */ -#include "./two_bit_quantize-inl.h" - -namespace mxnet { -namespace op { - -void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float threshold) { - Quantize2BitKernelLaunch(s, inputs, threshold); -} - -void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, - const float threshold) { - Dequantize2BitKernelLaunch(s, inputs, threshold); -} - -} // namespace op -} // namespace mxnet diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h index 4c2e9d2bf0de..21d0776fca4c 100644 --- a/tests/cpp/include/test_core_op.h +++ b/tests/cpp/include/test_core_op.h @@ -19,13 +19,10 @@ #ifndef TEST_CORE_OP_H_ #define TEST_CORE_OP_H_ -<<<<<<< HEAD -======= #include #include #include #include ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 #include "./test_op.h" #include "../../../src/imperative/imperative_utils.h" @@ -44,11 +41,7 @@ namespace op { */ template class CoreOpExecutor : public test::op::OperatorDataInitializer -<<<<<<< HEAD - , public test::op::OperatorExecutorTiming { -======= , public test::op::OperatorExecutorTiming { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 /*! 
\brief Performance timing categories */ enum TimingId { Forward, @@ -63,17 +56,11 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer * \param cb Callback Function to call with CPU-data NDArray */ template -<<<<<<< HEAD - static inline void AccessAsCPU(const NDArray &src, const RunContext &run_ctx, CallbackFunction cb) { -#if MXNET_USE_CUDA - if(src.ctx().dev_type == Context::kCPU) { -======= static inline void AccessAsCPU(const NDArray &src, const RunContext &run_ctx, CallbackFunction cb) { #if MXNET_USE_CUDA if (src.ctx().dev_type == Context::kCPU) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 cb(src); } else { Context cpu_ctx, gpu_ctx = src.ctx(); @@ -103,12 +90,8 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer std::vector keys, values; keys.reserve(count); values.reserve(count); -<<<<<<< HEAD - for (kwargs_t::const_iterator i_iter = args.begin(), e_iter = args.end(); i_iter != e_iter; ++i_iter) { -======= for (kwargs_t::const_iterator i_iter = args.begin(), e_iter = args.end(); i_iter != e_iter; ++i_iter) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 keys.push_back(i_iter->first.c_str()); values.push_back(i_iter->second.c_str()); } @@ -121,12 +104,8 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer * \param dest Vector to store pointers to the NDArrays' data blobs * \return Reference to the supplied vector of TBlob results */ -<<<<<<< HEAD - static inline std::vector& CollectBlobs(std::vector& src, std::vector *dest) { -======= static inline std::vector& CollectBlobs(const std::vector& src, std::vector *dest) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 dest->reserve(dest->size() + src.size()); for (size_t i = 0, n = src.size(); i < n; ++i) { dest->push_back(src[i].data()); @@ -193,11 +172,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer std::cout << node_entry.node->op()->name << std::endl; } std::shared_ptr pOp = std::make_shared( -<<<<<<< HEAD - 
ctx().run_ctx.ctx.dev_type == Context::kGPU, outputs()[0].shape()); -======= ctx().run_ctx.ctx.dev_type == Context::kGPU, ShapesOf(outputs())); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 res.push_back({ pOp, node_entry.node->op()->name }); } } @@ -205,41 +180,23 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer } /*! -<<<<<<< HEAD - * \brief Attach any temp or tandom resources required to perform the op's compute operation -======= * \brief Attach any temp or random resources required to perform the op's compute operation ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 * \param ctx Operator context object * \param attrs NodeAttrs structure (node attributes) * \param op Pointer to nnvm Operator object */ -<<<<<<< HEAD - void AttachResources(OpContext &ctx, const nnvm::NodeAttrs& attrs, const nnvm::Op *op) { - static auto& fresource = nnvm::Op::GetAttr("FResourceRequest"); - if (fresource.count(op) != 0) { - std::vector& requested = ctx.requested; -======= void AttachResources(OpContext *ctx, const nnvm::NodeAttrs& attrs, const nnvm::Op *op) { static auto& fresource = nnvm::Op::GetAttr("FResourceRequest"); if (fresource.count(op) != 0) { std::vector& requested = ctx->requested; ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 auto reqs = fresource[op](attrs); // Get the resource of temporal space. 
for (const ResourceRequest& req : reqs) { if (req.type == ResourceRequest::kTempSpace) { -<<<<<<< HEAD - Resource r = ResourceManager::Get()->Request(ctx.run_ctx.ctx, req); - requested.push_back(r); - } else if (req.type == ResourceRequest::kRandom) { - requested.push_back(ResourceManager::Get()->Request(ctx.run_ctx.ctx, req)); -======= Resource r = ResourceManager::Get()->Request(ctx->run_ctx.ctx, req); requested.push_back(r); } else if (req.type == ResourceRequest::kRandom) { requested.push_back(ResourceManager::Get()->Request(ctx->run_ctx.ctx, req)); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 } else { LOG(FATAL) << "resource type not yet supported"; } @@ -263,11 +220,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer } } new_args.push_back({ COREOP_FWD_OP_NAME_KEY, fwd_op_name}); -<<<<<<< HEAD - if(!bwd_op_name.empty()) { -======= if (!bwd_op_name.empty()) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 new_args.push_back({ COREOP_BWD_OP_NAME_KEY, bwd_op_name}); } return new_args; @@ -285,11 +238,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer for (const auto& a : args) { if (a.first == COREOP_FWD_OP_NAME_KEY) { *fwd_op_name_ptr = a.second; -<<<<<<< HEAD - } else if(a.first == COREOP_BWD_OP_NAME_KEY) { -======= } else if (a.first == COREOP_BWD_OP_NAME_KEY) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 *bwd_op_name_ptr = a.second; } else { new_args.push_back(a); @@ -300,17 +249,11 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer /*! * \brief Constructor -<<<<<<< HEAD - */ - CoreOpExecutor(const bool isGPU, const TShape& shape) - : input_shape_(shape) -======= * \param isGPU Is this going to be on the GPU? 
* \param shapes Array of input shapes */ CoreOpExecutor(const bool isGPU, const std::vector& shapes) : input_shapes_(shapes) ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 , op_(nullptr) { ctx_.is_train = true; ctx_.run_ctx.ctx.dev_id = 0; @@ -349,11 +292,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer CHECK(!backward_for_op || bwd_op_name.empty()) << "Backward op should not be supplied another backward operator"; -<<<<<<< HEAD - if(verbose_ && backward_for_op) { -======= if (verbose_ && backward_for_op) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 std::cout << "Backward op: " << op_name; } @@ -371,15 +310,6 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer int inferred_num_outputs, num_visible_outputs; -<<<<<<< HEAD - imperative::SetNumOutputs(op_, attrs_, num_inputs, &inferred_num_outputs, &num_visible_outputs); - - // Generic, all shapes the same. Probably this will need to be adjusted for more complex - // operators such as dot - std::vector shapes(static_cast(std::max(num_visible_outputs, num_inputs)), - input_shape_); - -======= imperative::SetNumOutputs(op_, attrs_, num_inputs, &inferred_num_outputs, &num_visible_outputs); @@ -390,7 +320,6 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer shapes.push_back(i < input_shapes_.size() ? input_shapes_[i] : input_shapes_[input_shapes_.size() - 1]); } ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 std::vector inputs_p, outputs_p; if (!outputs.empty()) { @@ -403,10 +332,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer outputs_p.reserve(num_visible_outputs); for (int i = 0; i < num_inputs; ++i) { -<<<<<<< HEAD -======= CHECK_LT(i, static_cast(shapes.size())); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 inputs_.push_back(i < inputs.size() ? 
inputs[i] : CreateRandArray(shapes[i], ctx_.run_ctx.ctx)); inputs_p.push_back(&*inputs_.rbegin()); @@ -422,11 +348,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer outputs_p.push_back(&*outputs_.rbegin()); } -<<<<<<< HEAD - if(!backward_for_op) { -======= if (!backward_for_op) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 DispatchMode dispatch_mode = DispatchMode::kUndefined; imperative::SetShapeType(ctx_.run_ctx.ctx, attrs_, inputs_p, outputs_p, &dispatch_mode); } else { @@ -435,13 +357,9 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer for (int i = 0; i < num_visible_outputs; ++i) { CHECK_LT(static_cast(i), shapes.size()); // backward outputs should look like forward inputs -<<<<<<< HEAD - CHECK_EQ(backward_for_op->inputs()[i].shape(), outputs_[i].shape()); -======= // TODO(cjolivier01): This check fails for dot product... // Need better inference of backward shapes // CHECK_EQ(backward_for_op->inputs()[i].shape(), outputs_[i].shape()); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 } } @@ -454,30 +372,17 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer function_ = common::GetFCompute(op_, "FCompute", ctx_.run_ctx.ctx); functionex_ = common::GetFCompute(op_, "FComputeEx", ctx_.run_ctx.ctx); -<<<<<<< HEAD - AttachResources(ctx_, attrs_, op_); - - if(!backward_for_op) { -======= AttachResources(&ctx_, attrs_, op_); if (!backward_for_op) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 bool no_backward = false; // Set up backward std::vector, std::string>> bwd; if (!bwd_op_name.empty()) { -<<<<<<< HEAD - if(bwd_op_name != COREOP_BWD_OP_NAME_VALUE_NONE) { - // Backward op was specified - std::shared_ptr pOp = std::make_shared( - ctx().run_ctx.ctx.dev_type == Context::kGPU, this->outputs()[0].shape()); -======= if (bwd_op_name != COREOP_BWD_OP_NAME_VALUE_NONE) { // Backward op was specified std::shared_ptr pOp = std::make_shared( ctx().run_ctx.ctx.dev_type == Context::kGPU, ShapesOf(this->outputs())); 
->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 bwd.push_back({pOp, bwd_op_name}); } else { no_backward = true; @@ -486,11 +391,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer // Try to figure out backward op bwd = GetBackward(); } -<<<<<<< HEAD - if(!no_backward) { -======= if (!no_backward) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 CHECK_GE(bwd.size(), 1U) << "Can't automatically determine backward op name. Please specify"; for (std::pair, std::string> &bw_item : bwd) { @@ -641,29 +542,17 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer */ OpContext ctx_; -<<<<<<< HEAD - #if MXNET_USE_CUDA -======= #if MXNET_USE_CUDA ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 /*! \brief * Scoped GPU stream */ std::unique_ptr allocGPUStream_; -<<<<<<< HEAD - #endif -======= #endif ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 /*! * \brief Input data shape */ -<<<<<<< HEAD - TShape input_shape_; -======= std::vector input_shapes_; ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 /* * \brief Pointer to the operator object */ diff --git a/tests/cpp/include/test_legacy_op.h b/tests/cpp/include/test_legacy_op.h index c3098e4f14ea..30bdf07b8b51 100644 --- a/tests/cpp/include/test_legacy_op.h +++ b/tests/cpp/include/test_legacy_op.h @@ -38,8 +38,6 @@ #ifndef TEST_LEGACY_OP_H_ #define TEST_LEGACY_OP_H_ -<<<<<<< HEAD -======= #include #include #include @@ -47,7 +45,6 @@ #include #include #include "../../../include/mxnet/operator.h" ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 #include "./test_op.h" #include "./test_op_runner.h" @@ -63,21 +60,13 @@ namespace op { */ template class LegacyOperatorExecutor : public OperatorDataInitializer -<<<<<<< HEAD - , public OperatorExecutorTiming { -======= , public OperatorExecutorTiming { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 public: typedef DType DataType; typedef AccReal AccRealType; /*! 
\brief Manage test blobs and context */ -<<<<<<< HEAD - LegacyOperatorExecutor(const bool isGPU, const TShape& topShape) -======= LegacyOperatorExecutor(const bool isGPU, const std::vector& topShapes) ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 #if !MXNET_USE_CUDA : isGPU_(false) #else @@ -88,13 +77,8 @@ class LegacyOperatorExecutor : public OperatorDataInitializer , initializeCallback_(0) { opContext_.is_train = true; opContext_.run_ctx.stream = nullptr; -<<<<<<< HEAD - - shape_input_vec_.push_back(topShape); -======= CHECK(!topShapes.empty()); shape_input_vec_ = topShapes; ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 } inline mxnet::Context getContext() { @@ -196,11 +180,7 @@ class LegacyOperatorExecutor : public OperatorDataInitializer const std::vector req(c_.blob_output_vec_.size(), kWriteTo); // Possibly move data to/from CPU and GPU (outside of timing scope) MXNET_CUDA_ONLY(std::unique_ptr gpuData(isGPU_ ? -<<<<<<< HEAD - new GPUOpData(c_, &opContext_) : nullptr)); -======= new GPUOpData(c_, &opContext_) : nullptr)); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), Forward, "Forward", count); if (!isGPU_) { @@ -228,11 +208,7 @@ class LegacyOperatorExecutor : public OperatorDataInitializer const std::vector req(c_.blob_in_grad_.size(), kWriteTo); // Possibly move data to/from CPU and GPU (outside of timing scope) MXNET_CUDA_ONLY(std::unique_ptr gpuData(isGPU_ ? -<<<<<<< HEAD - new GPUOpData(c_, &opContext_) : nullptr)); -======= new GPUOpData(c_, &opContext_) : nullptr)); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 perf::TimingItem timeB(&OperatorExecutorTiming::GetTiming(), Backward, "Backward", count); if (!isGPU_) { @@ -377,11 +353,7 @@ class LegacyOperatorExecutor : public OperatorDataInitializer /*! 
\brief Runtime load of the C++ data code generated by dumpC() */ void load(const std::vector>>& cData, -<<<<<<< HEAD - const BlobVectorType type) { -======= const BlobVectorType type) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 CHECK_LT(type, cData.size()); for (size_t j = 0, nj = cData[type].size(); j < nj; ++j) { const TBlob& blob = getBlobVect(type)[j]; @@ -394,11 +366,7 @@ class LegacyOperatorExecutor : public OperatorDataInitializer /*! \brief Runtime load of the C++ data code generated by dumpC() */ void load(const std::vector>>& cData, -<<<<<<< HEAD - const BlobVectorType type, const int idx) { -======= const BlobVectorType type, const int idx) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 CHECK_LT(type, cData.size()); CHECK_LT(idx, cData[type].size()); const TBlob& blob = getBlobVect(type)[idx]; @@ -419,27 +387,12 @@ class LegacyOperatorExecutor : public OperatorDataInitializer } } -<<<<<<< HEAD - std::vector& inputs() { - return c_.blob_input_vec_; - } - std::vector& outputs() { - return c_.blob_output_vec_; - } - std::vector& bwd_inputs() { - return c_.blob_out_grad_; - } - std::vector& bwd_outputs() { - return c_.blob_in_grad_; - } -======= std::vector& inputs() { return c_.blob_input_vec_; } const std::vector& inputs() const { return c_.blob_input_vec_; } std::vector& outputs() { return c_.blob_output_vec_; } const std::vector& outputs() const { return c_.blob_output_vec_; } std::vector& bwd_inputs() { return c_.blob_out_grad_; } std::vector& bwd_outputs() { return c_.blob_in_grad_; } ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 /*! 
\brief Input and output blobs */ OpContext opContext_; @@ -596,12 +549,8 @@ class LegacyOperatorExecutor : public OperatorDataInitializer }; template -<<<<<<< HEAD -using LegacyOpRunner = mxnet::test::OperatorRunner>; -======= using LegacyOpRunner = mxnet::test::OperatorRunner>; ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 } // namespace op } // namespace test diff --git a/tests/cpp/include/test_op.h b/tests/cpp/include/test_op.h index 09238f7808ea..949f2ccdf421 100644 --- a/tests/cpp/include/test_op.h +++ b/tests/cpp/include/test_op.h @@ -38,9 +38,7 @@ #ifndef TEST_OP_H_ #define TEST_OP_H_ -#include "test_perf.h" -#include "test_util.h" - +#include #include #include #include @@ -51,6 +49,8 @@ #include #include #include +#include "./test_perf.h" +#include "./test_util.h" namespace mxnet { namespace test { @@ -63,6 +63,9 @@ namespace op { #endif #if MXNET_USE_CUDA +/*! + * \brief Maintain the lifecycle of a GPU stream + */ struct GPUStreamScope { explicit inline GPUStreamScope(OpContext *opContext) : opContext_(*opContext) { @@ -83,472 +86,58 @@ struct GPUStreamScope { #endif // MXNET_USE_CUDA /*! - * \brief Manage test blobs and context, and universal logic - * Create an operator from its "Prop" class and sets up the operator - * and resources for both forward and backward passes - * \tparam DType + * \brief Base class for operator test-data classes */ -template -class BasicOperatorData { +template +class OperatorDataInitializer { public: - /*! 
\brief Manage test blobs and context */ - BasicOperatorData(const bool isGPU, const TShape& topShape) -#if !MXNET_USE_CUDA - : isGPU_(false) -#else - : isGPU_(isGPU) -#endif - , initializeForward_(0) // unit testing may call inits in any order based - , initializeBackward_(0) // upon its use-case (ie may not want to run forward pass first) - , initializeCallback_(0) { - opContext_.is_train = true; - opContext_.run_ctx.stream = nullptr; - - shape_input_vec_.push_back(topShape); - } - - inline mxnet::Context getContext() { - return isGPU_ ? mxnet::Context::GPU(0) : mxnet::Context{}; - } - - /*! \brief Initialize forward blob data values */ - virtual void resetForward() {} - - /*! \brief Initialize backward blob data values */ - virtual void resetBackward() {} - - /*! \brief Initialize auxiliary and output blobs */ - virtual bool initForward(const OperatorProperty &opProp, std::vector *in_type) { - if (!initializeForward_++) { - shape_input_vec_.resize(opProp.ListArguments().size()); - op_.reset(opProp.CreateOperatorEx(getContext(), &shape_input_vec_, in_type)); - if (op_) { - // Figure out what sort of blobs we need to allocate - std::vector out_shape, aux_shape; - opProp.InferShape(&shape_input_vec_, &out_shape, &aux_shape); - std::vector out_type, aux_type; - opProp.InferType(in_type, &out_type, &aux_type); - - // Allocate top blobs (input) - for (size_t x = 0, n = shape_input_vec_.size(); x < n; ++x) { - int type; - if (x < in_type->size()) { - type = (*in_type)[x]; - } else { - type = x ? mshadow::DataType::kFlag : mshadow::DataType::kFlag; - } - - allocateBlob(&c_.blob_input_vec_, shape_input_vec_[x], false, type); - } - - // Allocate aux blobs (scratch, hidden, etc.) 
- for (size_t x = 0, n = aux_shape.size(); x < n; ++x) { - CHECK(x < aux_type.size()); - allocateBlob(&c_.blob_aux_states_, aux_shape[x], false, aux_type[x]); - } - - // Allocate bottom blobs (output) - for (size_t x = 0, n = out_shape.size(); x < n; ++x) { - CHECK(x < out_type.size()); - allocateBlob(&c_.blob_output_vec_, out_shape[x], false, out_type[x]); - } - - // Get the resource of temporal space - std::vector inputShapes; - for (size_t x = 0, n = shape_input_vec_.size(); x < n; ++x) { - inputShapes.push_back(shape_input_vec_[x]); - } - allocateResources(opProp.ForwardResource(inputShapes)); - - resetForward(); - return true; - } - return false; - } else { - return true; - } - } - - /*! \brief Initialize auxiliary and output blobs */ - virtual bool initBackward(const OperatorProperty &opProp, std::vector *in_type) { - initForward(opProp, in_type); - if (!initializeBackward_++) { - for (size_t x = 0, n = static_cast(opProp.NumVisibleOutputs()); x < n; ++x) { - CHECK_LT(x, c_.blob_input_vec_.size()); - allocateBlob(&c_.blob_out_grad_, c_.blob_input_vec_[x].shape_, - false, c_.blob_input_vec_[x].type_flag_); - } - - for (size_t x = 0, n = c_.blob_input_vec_.size(); x < n; ++x) { - allocateBlob(&c_.blob_in_grad_, c_.blob_input_vec_[x].shape_, - false, c_.blob_input_vec_[x].type_flag_); - } - - // Get the resource of temporal space - std::vector ishapes; - allocateResources(opProp.BackwardResource(ishapes)); - - resetBackward(); - return false; - } else { - return true; - } - } - - /*! \brief Run operator forward */ - void forward(const size_t count = 1) { - // Possibly move data to/from CPU and GPU (outside of timing scope) - MXNET_CUDA_ONLY(std::unique_ptr gpuData(isGPU_ ? 
- new GPUOpData(c_, &opContext_) : nullptr)); - perf::TimingItem timeF(&timing_, Forward, "Forward", count); - if (!isGPU_) { - VTuneResume profile; // VTune sample only this scope - for (size_t x = 0; x < count; ++x) { - op()->Forward(opContext_, - c_.blob_input_vec_, - {kWriteTo, kWriteTo, kWriteTo}, - c_.blob_output_vec_, - c_.blob_aux_states_); - } - } else { - for (size_t x = 0; x < count; ++x) { - MXNET_CUDA_ONLY(op()->Forward(opContext_, - gpuData->blob_input_vec_, - {kWriteTo, kWriteTo, kWriteTo}, - gpuData->blob_output_vec_, - gpuData->blob_aux_states_)); - } - } - } - - /*! \brief Run operator backwards */ - void backward(const size_t count = 1) { - // Possibly move data to/from CPU and GPU (outside of timing scope) - MXNET_CUDA_ONLY(std::unique_ptr gpuData(isGPU_ ? - new GPUOpData(c_, &opContext_) : nullptr)); - perf::TimingItem timeB(&timing_, Backward, "Backward", count); - if (!isGPU_) { - VTuneResume profile; // VTune sample only this scope - for (size_t x = 0; x < count; ++x) { - op()->Backward(opContext_, - c_.blob_out_grad_, - c_.blob_input_vec_, - c_.blob_output_vec_, - {kWriteTo, kWriteTo, kWriteTo}, - c_.blob_in_grad_, - c_.blob_aux_states_); - } - } else { - for (size_t x = 0; x < count; ++x) { - MXNET_CUDA_ONLY(op()->Backward(opContext_, - gpuData->blob_out_grad_, - gpuData->blob_input_vec_, - gpuData->blob_output_vec_, - {kWriteTo, kWriteTo, kWriteTo}, - gpuData->blob_in_grad_, - gpuData->blob_aux_states_)); - } - } - } - - /*! \brief Getter functions for the operator */ - inline Operator *op() { return op_.get(); } - inline const Operator *op() const { return op_.get(); } - - enum BlobVectorType { - kInput, - kOutput, - kAux, - kInGrad, - kOutGrad, - kBlobVectorTypeCount - }; - - #define CASE_STR(__v$) case (__v$): return #__v$ - - /*! 
\brief Convert BlobVectorType enum into a string */ - static inline const char *bvt2String(const BlobVectorType bvt) { - switch (bvt) { - CASE_STR(kInput); - CASE_STR(kOutput); - CASE_STR(kAux); - CASE_STR(kInGrad); - CASE_STR(kOutGrad); - default: - CHECK(false); - return ""; - } - } - #undef CASE_STR - - /*! \brief Return a particular blob in a test data set */ - inline const std::vector& getBlobVect(const BlobVectorType bvt) const { - switch (bvt) { - case kInput: - return c_.blob_input_vec_; - case kOutput: - return c_.blob_output_vec_; - case kAux: - return c_.blob_aux_states_; - case kInGrad: - return c_.blob_in_grad_; - case kOutGrad: - return c_.blob_out_grad_; - default: - CHECK(false); - return c_.blob_input_vec_; - } + OperatorDataInitializer() + : generator_(new std::mt19937()) { } - /*! \brief Dump an operator's data set into compilable C++ data code for runtime validation - * When writing an operator test, you can generate a "known good operator data state" in C++ - * code with this function, and then use load() to load the blob states into this - * class (BasicOperatorData). - * After that, you can compare with the "actual" operator state (BasicOperatorData) of - * the operator that you are testing. + /*! 
+ * \brief Fill a blob with random values + * \param blob Blob which to fill with random values */ - template - inline void dumpC(Stream *_os, const std::string& label) { - Stream& os = *_os; - os << "static const std::vector< std::vector< std::vector > > ___" - << label << "_data_shape_"; - const TShape& shape = shape_input_vec_[0]; - for (size_t i = 0, n = shape.ndim(); i < n; ++i) { - os << shape[i] << "_"; - } - os << "__ =" << std::endl << "{" << std::endl; - for (size_t x = 0; x < kBlobVectorTypeCount; ++x) { - os << " { /* " << bvt2String(BlobVectorType(x)) << " */" << std::endl; - const std::vector& blobVect = getBlobVect(BlobVectorType(x)); - for (size_t i = 0, n = blobVect.size(); i < n; ++i) { - os << " { "; - test::dump(&os, blobVect[i]); - os << " }"; - if (i < n - 1) { - os << ","; - } - os << std::endl; - } - os << " }"; - if (x < kBlobVectorTypeCount - 1) { - os << ","; - } - os << std::endl; - } - os << "};" << std::endl; - } - - static inline void copy(const TBlob& blob, const DType array[], - const size_t start, const size_t end) { - const size_t blobSize = blob.Size(); - DType *p = blob.dptr(); - for (size_t i = 0, n = end - start; i < n; ++i) { - CHECK_LT(i, blobSize); - p[i] = array[i + start]; - } - } - - /*! \brief Runtime load of the C++ data code generated by dumpC() */ - void load(const std::vector>>& cData) { - for (size_t i = 0, ni = cData.size(); i < ni; ++i) { - for (size_t j = 0, nj = cData[i].size(); j < nj; ++j) { - const TBlob& blob = getBlobVect(BlobVectorType(i))[j]; - const size_t sourceDataSize = cData[i][j].size(); - CHECK_EQ(sourceDataSize, blob.Size()); - const DType *sourceData = &cData[i][j][0]; - copy(blob, sourceData, 0, sourceDataSize); - } - } - } - - /*! 
\brief Runtime load of the C++ data code generated by dumpC() */ - void load(const std::vector>>& cData, - const BlobVectorType type) { - CHECK_LT(type, cData.size()); - for (size_t j = 0, nj = cData[type].size(); j < nj; ++j) { - const TBlob& blob = getBlobVect(type)[j]; - const size_t sourceDataSize = cData[type][j].size(); - CHECK_EQ(sourceDataSize, blob.Size()); - const DType *sourceData = &cData[type][j][0]; - copy(blob, sourceData, 0, sourceDataSize); - } + void FillRandom(const TBlob& blob) const { + std::uniform_real_distribution distribution(-1.0, 1.0); + test::patternFill(&blob, [this, &distribution]() -> DType { + return distribution(this->generator()); + }); } - /*! \brief Runtime load of the C++ data code generated by dumpC() */ - void load(const std::vector>>& cData, - const BlobVectorType type, const int idx) { - CHECK_LT(type, cData.size()); - CHECK_LT(idx, cData[type].size()); - const TBlob& blob = getBlobVect(type)[idx]; - const size_t sourceDataSize = cData[type][idx].size(); - CHECK_EQ(sourceDataSize, blob.Size()); - const DType *sourceData = &cData[type][idx][0]; - copy(blob, sourceData, 0, sourceDataSize); + void FillZero(const TBlob& blob) const { + std::uniform_real_distribution distribution(-1.0, 1.0); + test::patternFill(&blob, [this, &distribution]() -> DType { + return DType(0); + }); } - /*! 
\brief Input and output blobs */ - OpContext opContext_; - - std::vector shape_input_vec_; - - struct OpData { - std::vector blob_input_vec_; - std::vector blob_output_vec_; - std::vector blob_aux_states_; - std::vector blob_in_grad_; - std::vector blob_out_grad_; // Remaining err (loss) pushing back upstream - - std::vector *> all_blob_vects_; - inline OpData() { - all_blob_vects_.push_back(&blob_input_vec_); - all_blob_vects_.push_back(&blob_output_vec_); - all_blob_vects_.push_back(&blob_aux_states_); - all_blob_vects_.push_back(&blob_in_grad_); - all_blob_vects_.push_back(&blob_out_grad_); // Remaining err (loss) pushing back upstream - } - virtual ~OpData() {} - }; - -#if MXNET_USE_CUDA - class GPUOpData : public OpData { - GPUOpData() = delete; - GPUOpData(const GPUOpData& o) = delete; - - public: - inline GPUOpData(const OpData& cpuData, OpContext *opContext) - : cpuData_(cpuData) - , allocGPUStream_(opContext) { - // Copy CPU->GPU - CHECK_EQ(gpuBlobs_.size(), 0U); - CHECK_EQ(cpuData_.all_blob_vects_.size(), this->all_blob_vects_.size()); - for (size_t bvt = 0, nbvt = cpuData_.all_blob_vects_.size(); bvt < nbvt; ++bvt) { - std::vector& bv_src = *cpuData_.all_blob_vects_[bvt]; - std::vector& bvt_dest = *this->all_blob_vects_[bvt]; - for (size_t i = 0, n = bv_src.size(); i < n; ++i) { - const TBlob& srcBlob = bv_src[i]; - TBlob *destBlob = allocateBlob(&gpuBlobs_, &bvt_dest, srcBlob.shape_, - true, srcBlob.type_flag_); - - Context cpu_ctx, gpu_ctx; - cpu_ctx.dev_type = Context::kCPU; - gpu_ctx.dev_type = Context::kGPU; - cpu_ctx.dev_id = gpu_ctx.dev_id = 0; - - mxnet::ndarray::Copy(srcBlob, destBlob, cpu_ctx, - gpu_ctx, allocGPUStream_.opContext_.run_ctx); - } - } - cudaDeviceSynchronize(); - } - inline ~GPUOpData() { - // Copy GPU->CPU - cudaDeviceSynchronize(); - for (size_t bvt = 0, nbvt = this->all_blob_vects_.size(); bvt < nbvt; ++bvt) { - std::vector& bv_src = *this->all_blob_vects_[bvt]; - std::vector& bvt_dest = *cpuData_.all_blob_vects_[bvt]; - for 
(size_t i = 0, n = bv_src.size(); i < n; ++i) { - const TBlob& srcBlob = bv_src[i]; - TBlob *destBlob = &bvt_dest[i]; - - Context cpu_ctx, gpu_ctx; - cpu_ctx.dev_type = Context::kCPU; - gpu_ctx.dev_type = Context::kGPU; - cpu_ctx.dev_id = gpu_ctx.dev_id = 0; - - mxnet::ndarray::Copy(srcBlob, destBlob, gpu_ctx, - cpu_ctx, allocGPUStream_.opContext_.run_ctx); - } - } - gpuBlobs_.clear(); // Force deallocation of the GPU blob data - cudaDeviceSynchronize(); - } - - private: - /*! \brief Reference to the src/dest CPU data */ - const OpData& cpuData_; - /*! \brief The GPU-allocated blobs */ - std::list> gpuBlobs_; - /*! \brief Scoped GPU stream */ - GPUStreamScope allocGPUStream_; - }; -#endif // MXNET_USE_CUDA - - OpData c_; - - protected: - /*! \brief Allocate the operator's resource requests */ - void allocateResources(const std::vector& reqs) { - std::map cached_temp; - - Context ctx; - ctx.dev_type = isGPU_ ? Context::kGPU : Context::kCPU; - ctx.dev_id = 0; - - for (const ResourceRequest& req : reqs) { - if (req.type == ResourceRequest::kTempSpace) { - if (cached_temp.count(ctx) != 0) { - opContext_.requested.push_back(cached_temp.at(ctx)); - } else { - Resource r = ResourceManager::Get()->Request(ctx, req); - opContext_.requested.push_back(r); - cached_temp[ctx] = r; - } - } else if (req.type == ResourceRequest::kRandom) { - opContext_.requested.push_back(ResourceManager::Get()->Request(ctx, req)); - } else { - LOG(FATAL) << "resource type not yet supported"; - } - } - } - - /*! \brief Locally allocate a managed TBlob and insert into the supplied vector */ - static TBlob *allocateBlob(std::list> *standalone_blobs, - std::vector *dest, - const TShape& shape, - const bool isGPU, - const int dtype) { - test::StandaloneBlob *blob = new test::StandaloneBlob(shape, isGPU, dtype); - CHECK_NE(blob, static_cast(nullptr)); - standalone_blobs->push_back(std::unique_ptr(blob)); - (*dest).push_back(*blob); - return blob; - } - - /*! 
\brief Locally allocate a managed TBlob and insert into the supplied vector */ - inline TBlob *allocateBlob(std::vector *dest, const TShape& shape, - const bool isGPU, const int dtype) { - return allocateBlob(&standalone_blobs_, dest, shape, isGPU, dtype); - } - - /*! \brief Performance timing categories */ - enum TimingId { - Forward, - Backward - }; + private: + /*! + * \brief mt19937 generator for random number generator + * \return reference to mt19937 generator object + */ + std::mt19937& generator() const { return *generator_; } - /*! \brief The operator */ - std::unique_ptr op_; - /*! \brief Is this for a GPU? */ - const bool isGPU_; - /*! \brief Assure that the Forward initialized only once */ - std::atomic initializeForward_; - /*! \brief Assure that the Forward initialized only once */ - std::atomic initializeBackward_; - /*! \brief Assure that the callback is initialized only once */ - std::atomic initializeCallback_; - /*! \brief scoped lifecycle management of allocated blobs */ - std::list> standalone_blobs_; + /*! \brief Per-test generator */ + std::unique_ptr generator_; +}; +class OperatorExecutorTiming { public: + inline test::perf::TimingInstrument& GetTiming() { return timing_; } + + private: /*! Timing instrumentation */ test::perf::TimingInstrument timing_; }; /*! \brief Top-level operator test state info structure */ -template +template struct OpInfo { /*! \brief The operator data */ - std::shared_ptr< test::op::BasicOperatorData > data_; + std::shared_ptr< OperatorExecutor > executor_; /*! \brief The operator prop class */ std::shared_ptr prop_; /*! \brief The input type(s) */ @@ -556,12 +145,12 @@ struct OpInfo { }; /*! \brief Pair of op info objects, generally for validating ops against each other */ -template +template struct OpInfoPair { /*! \brief Operator item 1 */ - test::op::OpInfo info_1_; + test::op::OpInfo info_1_; /*! \brief Operator item 2 */ - test::op::OpInfo info_2_; + test::op::OpInfo info_2_; }; /*! 
\brief Base validator class for validating test data */ @@ -657,49 +246,36 @@ class Validator { } return true; } - - /*! \brief Compare similar blobs in two operator data structs */ - static bool compare( - const test::op::BasicOperatorData& i1, - const test::op::BasicOperatorData& i2, - const typename test::op::BasicOperatorData::BlobVectorType bvt, - const size_t idx, bool print = false) { - const std::vector& bv1 = i1.getBlobVect(bvt); - const std::vector& bv2 = i2.getBlobVect(bvt); - - // If this is an invalid index, at least make sure the two blob vects - // are similarly too small for the index - if (bv1.size() <= idx) { - CHECK(bv1.size() == bv2.size()); - return true; - } - const TBlob& b1 = bv1[idx]; - const TBlob& b2 = bv2[idx]; - if (print && test::debugOutput) { - test::print(&(std::cout << "Blob 1:"), b1, true, true); - test::print(&(std::cout << "Blob 2:"), b2, true, true); - } - return compare(b1, b2); - } }; /*! \brief Operator Prop argument key/value pairs */ typedef std::vector > kwargs_t; /*! \brief Create operator data, prop, the operator itself and init default forward input */ -template -static test::op::OpInfo createOpAndInfoF(const bool isGPU, - const TShape &inputShape, - const kwargs_t &kwargs) { - test::op::OpInfo info; - info.data_ = std::make_shared(isGPU, inputShape); +template< + typename OperatorProp, + typename OperatorExecutor, + typename ...Args> +static test::op::OpInfo createOpAndInfoF(const kwargs_t &kwargs, + Args... 
args) { + test::op::OpInfo info; + info.executor_ = std::make_shared(args...); info.prop_ = std::make_shared(); - info.in_type_ = { mshadow::DataType::kFlag }; + info.in_type_ = { mshadow::DataType::kFlag }; info.prop_->Init(kwargs); - info.data_->initForward(*info.prop_, &info.in_type_); + info.executor_->initForward(*info.prop_, &info.in_type_); return info; } +inline std::vector ShapesOf(const std::vector& arrays) { + std::vector res; + res.reserve(arrays.size()); + for (const NDArray& ar : arrays) { + res.push_back(ar.shape()); + } + return std::move(res); +} + } // namespace op } // namespace test } // namespace mxnet diff --git a/tests/cpp/include/test_op_runner.h b/tests/cpp/include/test_op_runner.h index 67a6a1d6390b..4c7cd1d774c0 100644 --- a/tests/cpp/include/test_op_runner.h +++ b/tests/cpp/include/test_op_runner.h @@ -43,10 +43,6 @@ template class OperatorRunner { public: typedef typename OperatorExecutor::DataType DType; -<<<<<<< HEAD - //typedef typename OperatorExecutor::AccRealType AccReal; -======= ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 /*! 
* \brief Test operator forward pass @@ -71,11 +67,7 @@ class OperatorRunner { isGPU = false; #endif test::op::OpInfo info = -<<<<<<< HEAD - test::op::createOpAndInfoF(kwargs, isGPU, inputShape); -======= test::op::createOpAndInfoF(kwargs, isGPU, inputShapes); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 info.executor_->initForward(*info.prop_, &info.in_type_); info.executor_->forward(count); return info; @@ -111,13 +103,8 @@ class OperatorRunner { const std::vector > &kwargs, const size_t count = 1) { test::op::OpInfo info = -<<<<<<< HEAD - RunGenericOperatorForward(isGPU, inputShape, kwargs, count); - if(info.executor_->HasBackward()) { -======= RunGenericOperatorForward(isGPU, inputShapes, kwargs, count); if (info.executor_->HasBackward()) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 return RunGenericOperatorBackward(&info, count); } return info; @@ -224,11 +211,7 @@ class OperatorRunner { CHECK(false) << "Unsupported dimension count: " << (D + 1); } if (info.executor_) { -<<<<<<< HEAD - if(info.executor_->HasBackward()) { -======= if (info.executor_->HasBackward()) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 RunGenericOperatorBackward(&info, count); } timing += info.executor_->GetTiming(); diff --git a/tests/cpp/include/test_perf.h b/tests/cpp/include/test_perf.h index d74d4d5a8976..b6f214576778 100644 --- a/tests/cpp/include/test_perf.h +++ b/tests/cpp/include/test_perf.h @@ -60,6 +60,25 @@ inline uint64_t getMicroTickCount() { #endif } +/*! \brief current timestamp: millionths of a second */ +inline uint64_t getNannoTickCount() { +#ifndef _WIN32 + struct timeval tv; + gettimeofday(&tv, NULL); + return (uint64_t(tv.tv_sec) * 1000000 + tv.tv_usec) * 1000; +#else + LARGE_INTEGER CurrentTime; + LARGE_INTEGER Frequency; + + QueryPerformanceFrequency(&Frequency); + QueryPerformanceCounter(&CurrentTime); + + CurrentTime.QuadPart *= 1000000000; + CurrentTime.QuadPart /= Frequency.QuadPart; + return CurrentTime.QuadPart; +#endif +} + /*! 
\brief millisecond tick count */ inline uint64_t getTickCount() { return getMicroTickCount() / 1000; diff --git a/tests/cpp/include/test_util.h b/tests/cpp/include/test_util.h index 2b775efc77d0..95ab14195492 100644 --- a/tests/cpp/include/test_util.h +++ b/tests/cpp/include/test_util.h @@ -40,8 +40,9 @@ namespace mxnet { namespace test { extern bool unitTestsWithCuda; -extern bool debugOutput; +extern bool debug_output; extern bool quick_test; +extern bool performance_run; /*! \brief Pause VTune analysis */ struct VTunePause { @@ -71,6 +72,81 @@ struct VTuneResume { } }; + +template +inline size_t shapeMemorySize(const TShape& shape) { + return shape.Size() * sizeof(DType); +} + +class BlobMemory { + public: + explicit inline BlobMemory(const bool isGPU) : isGPU_(isGPU) { + this->handle_.dptr = nullptr; + } + inline ~BlobMemory() { + Free(); + } + void *Alloc(const size_t size) { + CHECK_GT(size, 0U); // You've probably made a mistake + mxnet::Context context = isGPU_ ? mxnet::Context::GPU(0) : mxnet::Context{}; + Storage *storage = mxnet::Storage::Get(); + handle_ = storage->Alloc(size, context); + return handle_.dptr; + } + void Free() { + if (handle_.dptr) { + Storage *storage = mxnet::Storage::Get(); + storage->DirectFree(handle_); + handle_.dptr = nullptr; + } + } + size_t Size() const { + return handle_.size; + } + + private: + const bool isGPU_; + Storage::Handle handle_; +}; + +class StandaloneBlob : public TBlob { + public: + inline StandaloneBlob(const TShape& shape, const bool isGPU, const int dtype) + : TBlob(nullptr, shape, isGPU ? gpu::kDevMask : cpu::kDevMask, dtype) + , memory_(std::make_shared(isGPU)) { + MSHADOW_TYPE_SWITCH(dtype, DType, { + this->dptr_ = memory_->Alloc(shapeMemorySize(shape)); }); + } + inline ~StandaloneBlob() { + this->dptr_ = nullptr; + } + inline size_t MemorySize() const { + return memory_->Size(); + } + + private: + /*! 
\brief Locally allocated memory block for this blob */ + std::shared_ptr memory_; +}; + +#if MXNET_USE_CUDA +/*! \brief Return blob in CPU memory */ +inline StandaloneBlob BlobOnCPU(const RunContext &rctx, const TBlob& src) { + StandaloneBlob res(src.shape_, false, src.type_flag_); + if (src.dev_mask() == cpu::kDevMask) { + LOG(WARNING) << "BlobOnCPU() is safe, but try not to call this with a CPU blob" + << " because it is inefficient"; + memcpy(res.dptr_, src.dptr_, res.MemorySize()); + } else { + mshadow::Stream *stream = rctx.get_stream(); + MSHADOW_TYPE_SWITCH(src.type_flag_, DType, { + mshadow::Copy(res.FlatTo1D(), src.FlatTo1D(stream), stream); + }); + } + return res; +} +#endif // MXNET_USE_CUDA + constexpr const size_t MPRINT_PRECISION = 5; template @@ -195,11 +271,18 @@ inline StreamType& print_shape(StreamType *_os, const std::string& label, const /*! \brief Pretty print a 1D, 2D, or 3D blob */ template -inline StreamType& print_blob_(StreamType *_os, +inline StreamType& print_blob_(const RunContext& ctx, + StreamType *_os, const TBlob &blob, const bool doChannels = true, const bool doBatches = true, const bool add_endl = true) { +#if MXNET_USE_CUDA + if (blob.dev_mask() == gpu::kDevMask) { + return print_blob_(ctx, _os, BlobOnCPU(ctx, blob), doChannels, doBatches, add_endl); + } +#endif // MXNET_USE_CUDA + StreamType &os = *_os; const size_t dim = static_cast(blob.ndim()); @@ -209,7 +292,7 @@ inline StreamType& print_blob_(StreamType *_os, changed.shape_[0] = 1; changed.shape_[1] = 1; changed.shape_[2] = blob.shape_[0]; - return print_blob_(&os, changed, false, false, add_endl); + return print_blob_(ctx, &os, changed, false, false, add_endl); } else if (dim == 2) { // probably a 2d tensor (mshadow::Tensor is deprecated) TBlob changed(blob.dptr(), TShape(4), blob.dev_mask(), blob.dev_id()); @@ -217,7 +300,7 @@ inline StreamType& print_blob_(StreamType *_os, changed.shape_[1] = 1; changed.shape_[2] = blob.shape_[0]; changed.shape_[3] = blob.shape_[1]; - 
return print_blob_(&os, changed, false, false, add_endl); + return print_blob_(ctx, &os, changed, false, false, add_endl); } CHECK_GE(dim, 3U) << "Invalid dimension zero (0)"; @@ -320,19 +403,20 @@ inline StreamType& print_blob_(StreamType *_os, } template -inline StreamType& print(StreamType *_os, +inline StreamType& print(const RunContext& ctx, + StreamType *_os, const TBlob &blob, const bool doChannels = true, const bool doBatches = true, const bool add_endl = true) { MSHADOW_TYPE_SWITCH(blob.type_flag_, DType, { - print_blob_(_os, blob, doChannels, doBatches, add_endl); + print_blob_(ctx, _os, blob, doChannels, doBatches, add_endl); }); return *_os; } template -inline StreamType& print(StreamType *_os, const std::string &label, +inline StreamType& print(const RunContext& ctx, StreamType *_os, const std::string &label, const TBlob &blob, const bool doChannels = true, bool doBatches = true, @@ -340,11 +424,12 @@ inline StreamType& print(StreamType *_os, const std::string &label, if (!label.empty()) { *_os << label << ": "; } - return print(_os, blob, doChannels, doBatches, add_endl); + return print(ctx, _os, blob, doChannels, doBatches, add_endl); } template -inline StreamType& print(StreamType *_os, const std::string& label, const NDArray& arr) { +inline StreamType& print(const RunContext& ctx, StreamType *_os, + const std::string& label, const NDArray& arr) { if (!label.empty()) { *_os << label << ": "; } @@ -356,12 +441,12 @@ inline StreamType& print(StreamType *_os, const std::string& label, const NDArra const TShape& storage_shape = arr.storage_shape(); const bool is_one_row = storage_shape[0] < 2; print_shape(_os, "storage shape", storage_shape, false); - print(_os, arr.data(), true, true, !is_one_row); + print(ctx, _os, arr.data(), true, true, !is_one_row); // indices const TShape& indices_shape = arr.aux_shape(rowsparse::kIdx); print_shape(_os, "indices shape", indices_shape, false); - print(_os, arr.aux_data(rowsparse::kIdx), true, true, false) << 
std::endl; + print(ctx, _os, arr.aux_data(rowsparse::kIdx), true, true, false) << std::endl; break; } case kCSRStorage: { @@ -371,17 +456,17 @@ inline StreamType& print(StreamType *_os, const std::string& label, const NDArra const TShape& storage_shape = arr.storage_shape(); const bool is_one_row = storage_shape[0] < 2; print_shape(_os, "storage shape", storage_shape, false); - print(_os, arr.data(), true, true, !is_one_row); + print(ctx, _os, arr.data(), true, true, !is_one_row); // row ptrs const TShape& ind_ptr_shape = arr.aux_shape(csr::kIndPtr); print_shape(_os, "row ptrs shape", ind_ptr_shape, false); - print(_os, arr.aux_data(csr::kIndPtr), true, true, false) << std::endl; + print(ctx, _os, arr.aux_data(csr::kIndPtr), true, true, false) << std::endl; // col indices const TShape& indices_shape = arr.aux_shape(csr::kIdx); print_shape(_os, "col indices shape", indices_shape, false); - print(_os, arr.aux_data(csr::kIdx), true, true, false) << std::endl; + print(ctx, _os, arr.aux_data(csr::kIdx), true, true, false) << std::endl; break; } @@ -390,7 +475,7 @@ inline StreamType& print(StreamType *_os, const std::string& label, const NDArra const TShape& shape = arr.shape(); const bool is_one_row = shape[0] < 2; print_shape(_os, "[dense] main shape", shape, !is_one_row); - print(_os, arr.data(), true, true, !is_one_row) << std::endl; + print(ctx, _os, arr.data(), true, true, !is_one_row) << std::endl; break; } default: @@ -400,25 +485,27 @@ inline StreamType& print(StreamType *_os, const std::string& label, const NDArra return *_os << std::flush; } -inline void print(const std::string& label, +inline void print(const RunContext& ctx, + const std::string& label, const std::string& var, const std::vector& arrays) { std::cout << label << std::endl; for (size_t x = 0, n = arrays.size(); x < n; ++x) { std::stringstream ss; ss << var << "[" << x << "]"; - test::print(&std::cout, ss.str(), arrays[x]); + test::print(ctx, &std::cout, ss.str(), arrays[x]); } } -inline void 
print(const std::string& label, +inline void print(const RunContext& ctx, + const std::string& label, const std::string& var, const std::vector& arrays) { std::cout << label << std::endl; for (size_t x = 0, n = arrays.size(); x < n; ++x) { std::stringstream ss; ss << var << "[" << x << "]"; - test::print(&std::cout, ss.str(), arrays[x], true, true, false); + test::print(ctx, &std::cout, ss.str(), arrays[x], true, true, false); } } @@ -443,56 +530,6 @@ inline std::string type_name() { return demangle(typeid(T).name()); } "<" << type_name<__op1>().name()) << ", " \ << type_name<__op2>() << ">"))->str(), __var) -template -inline size_t shapeMemorySize(const TShape& shape) { - return shape.Size() * sizeof(DType); -} - -class BlobMemory { - public: - explicit inline BlobMemory(const bool isGPU) : isGPU_(isGPU) { - this->handle_.dptr = nullptr; - } - inline ~BlobMemory() { - Free(); - } - void *Alloc(const size_t size) { - CHECK_GT(size, 0U); // You've probably made a mistake - mxnet::Context context = isGPU_ ? mxnet::Context::GPU(0) : mxnet::Context{}; - Storage *storage = mxnet::Storage::Get(); - handle_ = storage->Alloc(size, context); - return handle_.dptr; - } - void Free() { - if (handle_.dptr) { - Storage *storage = mxnet::Storage::Get(); - storage->DirectFree(handle_); - handle_.dptr = nullptr; - } - } - - private: - const bool isGPU_; - Storage::Handle handle_; -}; - -class StandaloneBlob : public TBlob { - public: - inline StandaloneBlob(const TShape& shape, const bool isGPU, const int dtype) - : TBlob(nullptr, shape, isGPU ? gpu::kDevMask : cpu::kDevMask, dtype) - , memory_(isGPU) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - this->dptr_ = memory_.Alloc(shapeMemorySize(shape)); }); - } - inline ~StandaloneBlob() { - this->dptr_ = nullptr; - memory_.Free(); - } - private: - /*! \brief Locally allocated memory block for this blob */ - BlobMemory memory_; -}; - /*! 
\brief Fill blob with some pattern defined by the getNextData() callback * Pattern fill in the defined order (important for analysis): * 1D: batch item -> channel -> depth -> row -> col @@ -500,7 +537,7 @@ class StandaloneBlob : public TBlob { * 3D: batch item -> channel -> col */ template -static inline void patternFill(TBlob *blob, GetNextData getNextData) { +static inline void patternFill(const TBlob *blob, GetNextData getNextData) { const size_t dim = blob->ndim(); CHECK_LE(dim, 5U) << "Will need to handle above 3 dimensions (another for loop)"; const size_t num = blob->size(0); diff --git a/tests/cpp/operator/activation_perf.cc b/tests/cpp/operator/activation_perf.cc index fed5aff674ae..65bd9aaf4064 100644 --- a/tests/cpp/operator/activation_perf.cc +++ b/tests/cpp/operator/activation_perf.cc @@ -27,10 +27,7 @@ #include #include "../include/test_op_runner.h" #include "../include/test_legacy_op.h" -<<<<<<< HEAD -======= #include "../../src/operator/activation-inl.h" ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 using namespace mxnet; @@ -45,11 +42,7 @@ TEST(ACTIVATION_PERF, ExecuteBidirectional) { kwargs_t kwargs = basic_activation_args; kwargs.push_back({"act_type", "tanh"}); test::op::LegacyOpRunner runner; -<<<<<<< HEAD - runner.RunBidirectional(false, shape, kwargs, 1); -======= runner.RunBidirectional(false, { shape }, kwargs, 1); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 } /*! 
@@ -60,13 +53,9 @@ TEST(ACTIVATION_PERF, TimingCPU) { // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"act_type", "tanh"}); test::op::LegacyOpRunner runner; -<<<<<<< HEAD - runner.RunBidirectional(false, {10, 10, 10, 10}, kwargs, 1); // prime code and cache -======= runner.RunBidirectional(false, { TShape({10, 10, 10, 10}) }, kwargs, 1); // prime code and cache ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 std::vector shapes; if (test::performance_run) { shapes = { @@ -97,13 +86,9 @@ TEST(ACTIVATION_PERF, TimingGPU) { kwargs.push_back({"act_type", "tanh"}); test::OperatorRunner> runner; -<<<<<<< HEAD - runner.RunBidirectional(true, {10, 10, 10, 10}, kwargs, 1); // prime code and cache -======= runner.RunBidirectional(true, { TShape({10, 10, 10, 10}) }, kwargs, 1); // prime code and cache ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 std::vector shapes = { {1, 1, 28, 28}, {1, 3, 28, 28}, diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc index f2bc73afea00..24b5600a713c 100644 --- a/tests/cpp/operator/batchnorm_test.cc +++ b/tests/cpp/operator/batchnorm_test.cc @@ -27,7 +27,7 @@ #include #include "../../src/operator/batch_norm-inl.h" #include "../../src/operator/batch_norm_v1-inl.h" -#include "test_op.h" +#include "./test_legacy_op.h" #include "executor/exec_pass.h" using namespace mxnet; @@ -35,7 +35,7 @@ using namespace mxnet; #define SIMPLE_DIMENSIONS 0 #define MXNET_DUMP_C 0 #define DISABLE_VALIDATION 0 // If performance profiling, may do things - // that cause validation to fail +// that cause validation to fail #if !SIMPLE_DIMENSIONS static constexpr int BATCH_SIZE = 5; @@ -57,11 +57,125 @@ static constexpr int TIMING_DEPTH = 2; static constexpr int TIMING_DH = 28; static constexpr int TIMING_DW = 28; + +/*! 
\brief BatchNorm-specific test data */ +template +class BNOperatorExecutor : public test::op::LegacyOperatorExecutor { + public: + BNOperatorExecutor(const bool isGPU, const TShape& inputShape, + const bool hasWeightAndBias = false) + : test::op::LegacyOperatorExecutor(isGPU, { inputShape }) + , hasWeightAndBias_(hasWeightAndBias) { + } + + void resetForward() override { + // Init input data + MSHADOW_TYPE_SWITCH( + this->c_.blob_input_vec_[mxnet::op::batchnorm::kData].type_flag_, + DTypeX, + { + DTypeX val = 0; + test::patternFill(&this->c_.blob_input_vec_[mxnet::op::batchnorm::kData], + [&val]{ return val += 1; }); }); + + MSHADOW_TYPE_SWITCH( + this->c_.blob_input_vec_[mxnet::op::batchnorm::kGamma].type_flag_, + DTypeX, { + const TBlob& blob = this->c_.blob_input_vec_[mxnet::op::batchnorm::kGamma]; + test::fill(blob, DTypeX(1)); + if (hasWeightAndBias_) { + if (blob.size(0) > 1) { + blob.dptr()[1] = DTypeX(3); + } + } + }); + MSHADOW_TYPE_SWITCH( + this->c_.blob_input_vec_[mxnet::op::batchnorm::kBeta].type_flag_, + DTypeX, { + const TBlob& blob = this->c_.blob_input_vec_[mxnet::op::batchnorm::kBeta]; + if (!hasWeightAndBias_) { + test::fill(blob, DTypeX(0)); + } else { // This will cause forward pass check to fail when calculating sum == 0 + test::fill(blob, DTypeX(1)); + if (blob.size(0) > 0) { + blob.dptr()[0] = DTypeX(3); + } + } + }); + + // Init the moving data (all mean = 0, all var = 1) + MSHADOW_TYPE_SWITCH( + this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingMean].type_flag_, + DTypeX, { + test::fill(this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingMean], DTypeX(0)); + }); + MSHADOW_TYPE_SWITCH( + this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingVar].type_flag_, + DTypeX, { + test::fill(this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingVar], DTypeX(1));}); + + for (size_t i = 0, n = this->c_.blob_output_vec_.size(); i < n; ++i) { + const int dtype = this->c_.blob_output_vec_[i].type_flag_; + MSHADOW_TYPE_SWITCH(dtype, DTypeX, + { 
test::fill(this->c_.blob_output_vec_[i], DTypeX(0.1234)); }); + } + } + + void resetBackward() override { + DType val = -.001; + MSHADOW_TYPE_SWITCH( + this->c_.blob_out_grad_[mxnet::op::batchnorm::kOut].type_flag_, + DTypeX, { + test::patternFill(&this->c_.blob_out_grad_[mxnet::op::batchnorm::kOut], + [&val]{ return val += 1; }); + }); + + // out-grad weights + if (mxnet::op::batchnorm::kGamma < this->c_.blob_out_grad_.size()) { + MSHADOW_TYPE_SWITCH( + this->c_.blob_out_grad_[mxnet::op::batchnorm::kGamma].type_flag_, + DTypeX, + { test::try_fill(this->c_.blob_out_grad_, mxnet::op::batchnorm::kGamma, DTypeX(0.1)); }); + } + + // out-grad biases + if (mxnet::op::batchnorm::kBeta < this->c_.blob_out_grad_.size()) { + MSHADOW_TYPE_SWITCH( + this->c_.blob_out_grad_[mxnet::op::batchnorm::kBeta].type_flag_, + DTypeX, + { test::try_fill(this->c_.blob_out_grad_, mxnet::op::batchnorm::kBeta, DTypeX(0.1)); }); + } + + // in-grad + MSHADOW_TYPE_SWITCH( + this->c_.blob_in_grad_[mxnet::op::batchnorm::kData].type_flag_, + DTypeX, + { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kData, DTypeX(0)); }); + + // in-grad weights + if (mxnet::op::batchnorm::kGamma < this->c_.blob_in_grad_.size()) { + MSHADOW_TYPE_SWITCH( + this->c_.blob_in_grad_[mxnet::op::batchnorm::kGamma].type_flag_, + DTypeX, + { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kGamma, DTypeX(0)); }); + } + + // in-grad biases + if (mxnet::op::batchnorm::kBeta < this->c_.blob_in_grad_.size()) { + MSHADOW_TYPE_SWITCH( + this->c_.blob_in_grad_[mxnet::op::batchnorm::kBeta].type_flag_, + DTypeX, + { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kBeta, DTypeX(0)); }); + } + } + + const bool hasWeightAndBias_; // This will cause forward pass validation to fail +}; + /*! \brief Validate batch norm test outputs */ template class BatchNormValidator : public test::op::Validator { typedef test::op::Validator Super; - using Super::compare; /*! 
\brief Only static functions in this class */ BatchNormValidator() = delete; @@ -107,7 +221,7 @@ class BatchNormValidator : public test::op::Validator { // expect unit variance EXPECT_NEAR(1, var, kErrorBound); if (!Super::isNear(AccReal(1), var, kErrorBound)) { - LOG(WARNING) << "Variance is not close enough to 1" + LOG(WARNING) << "Variance is not close enough to 1 " << saveSum << " (" << sum << "), " << saveVar << " (" << var << ")"; } @@ -211,7 +325,7 @@ class BatchNormValidator : public test::op::Validator { // expect unit variance EXPECT_NEAR(1, var, kErrorBound); if (!Super::isNear(AccReal(1), var, kErrorBound)) { - LOG(WARNING) << "Variance is not close enough to 1" + LOG(WARNING) << "Variance is not close enough to 1 " << saveSum << " (" << sum << "), " << saveVar << " (" << var << ")"; } @@ -220,10 +334,39 @@ class BatchNormValidator : public test::op::Validator { } public: + template + static inline bool compare(const ExecutorType& i1, + const ExecutorType& i2, + const typename + test::op::LegacyOperatorExecutor::BlobVectorType bvt, + const size_t idx, bool print = false) { + // Validate legacy data + auto *legacy1 = dynamic_cast *>(&i1); + auto *legacy2 = dynamic_cast *>(&i2); + CHECK_NOTNULL(legacy1); + CHECK_NOTNULL(legacy2); + const std::vector &bv1 = legacy1->getBlobVect(bvt); + const std::vector &bv2 = legacy2->getBlobVect(bvt); + + // If this is an invalid index, at least make sure the two blob vects + // are similarly too small for the index + if (bv1.size() <= idx) { + CHECK(bv1.size() == bv2.size()); + return true; + } + const TBlob &b1 = bv1[idx]; + const TBlob &b2 = bv2[idx]; + if (print && test::debug_output) { + test::print(RunContext(), &(std::cout << "Blob 1:"), b1, true, true); + test::print(RunContext(), &(std::cout << "Blob 2:"), b2, true, true); + } + return test::op::Validator::compare(b1, b2); + } + /*! 
\brief Check batch norm output */ template static void validateForward(const BNOperatorProp& data) { - const TBlob& outputBlob = data.c_.blob_output_vec_[mxnet::op::batchnorm::kData]; + const TBlob& outputBlob = data.outputs()[mxnet::op::batchnorm::kData]; switch (outputBlob.ndim()) { case 3: checkBatchNorm1D(&outputBlob); @@ -242,169 +385,57 @@ class BatchNormValidator : public test::op::Validator { /*! \brief Compare entire operator data between two test sets */ template - static void compare(const test::op::OpInfo& info_1, - const test::op::OpInfo& info_2) { + static void compare( + const test::op::OpInfo>& info_1, + const test::op::OpInfo>& info_2) { // Input - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInput, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInput, op::batchnorm::kData)); - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInput, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInput, op::batchnorm::kGamma)); - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInput, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInput, op::batchnorm::kBeta)); // Output - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kOutput, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kOutput, op::batchnorm::kOut)); CHECK_EQ(info_2.prop_->getParam().use_global_stats, info_1.prop_->getParam().use_global_stats); #if MXNET_USE_CUDNN != 1 /* CUDNN takes a different approach here on first pass */ // Aux - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kAux, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kAux, op::batchnorm::kMovingMean)); - 
EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kAux, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kAux, op::batchnorm::kMovingVar)); #endif if (!info_2.prop_->getParam().use_global_stats) { - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kOutput, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kOutput, op::batchnorm::kMean)); // InGrad - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInGrad, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInGrad, op::batchnorm::kData)); - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInGrad, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInGrad, op::batchnorm::kGamma)); - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInGrad, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInGrad, op::batchnorm::kBeta)); // OutGrad - EXPECT_TRUE(compare(*info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kOutGrad, + EXPECT_TRUE(compare(*info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kOutGrad, op::batchnorm::kData)); } } }; -/*! 
\brief BatchNorm-specific test data */ -template -class BNOperatorData : public test::op::BasicOperatorData { - public: - BNOperatorData(const bool isGPU, const TShape& inputShape, const bool hasWeightAndBias = false) - : test::op::BasicOperatorData(isGPU, inputShape) - , hasWeightAndBias_(hasWeightAndBias) { - } - - void resetForward() override { - // Init input data - MSHADOW_TYPE_SWITCH( - this->c_.blob_input_vec_[mxnet::op::batchnorm::kData].type_flag_, - DTypeX, - { - DTypeX val = 0; - test::patternFill(&this->c_.blob_input_vec_[mxnet::op::batchnorm::kData], - [&val]{ return val += 1; }); }); - - MSHADOW_TYPE_SWITCH( - this->c_.blob_input_vec_[mxnet::op::batchnorm::kGamma].type_flag_, - DTypeX, { - const TBlob& blob = this->c_.blob_input_vec_[mxnet::op::batchnorm::kGamma]; - test::fill(blob, DTypeX(1)); - if (hasWeightAndBias_) { - if (blob.size(0) > 1) { - blob.dptr()[1] = DTypeX(3); - } - } - }); - MSHADOW_TYPE_SWITCH( - this->c_.blob_input_vec_[mxnet::op::batchnorm::kBeta].type_flag_, - DTypeX, { - const TBlob& blob = this->c_.blob_input_vec_[mxnet::op::batchnorm::kBeta]; - if (!hasWeightAndBias_) { - test::fill(blob, DTypeX(0)); - } else { // This will cause forward pass check to fail when calculating sum == 0 - test::fill(blob, DTypeX(1)); - if (blob.size(0) > 0) { - blob.dptr()[0] = DTypeX(3); - } - } - }); - - // Init the moving data (all mean = 0, all var = 1) - MSHADOW_TYPE_SWITCH( - this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingMean].type_flag_, - DTypeX, { - test::fill(this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingMean], DTypeX(0)); - }); - MSHADOW_TYPE_SWITCH( - this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingVar].type_flag_, - DTypeX, { - test::fill(this->c_.blob_aux_states_[mxnet::op::batchnorm::kMovingVar], DTypeX(1));}); - - for (size_t i = 0, n = this->c_.blob_output_vec_.size(); i < n; ++i) { - const int dtype = this->c_.blob_output_vec_[i].type_flag_; - MSHADOW_TYPE_SWITCH(dtype, DTypeX, - { 
test::fill(this->c_.blob_output_vec_[i], DTypeX(0.1234)); }); - } - } - - void resetBackward() override { - DType val = -.001; - MSHADOW_TYPE_SWITCH( - this->c_.blob_out_grad_[mxnet::op::batchnorm::kOut].type_flag_, - DTypeX, { - test::patternFill(&this->c_.blob_out_grad_[mxnet::op::batchnorm::kOut], - [&val]{ return val += 1; }); - }); - - // out-grad weights - if (mxnet::op::batchnorm::kGamma < this->c_.blob_out_grad_.size()) { - MSHADOW_TYPE_SWITCH( - this->c_.blob_out_grad_[mxnet::op::batchnorm::kGamma].type_flag_, - DTypeX, - { test::try_fill(this->c_.blob_out_grad_, mxnet::op::batchnorm::kGamma, DTypeX(0.1)); }); - } - - // out-grad biases - if (mxnet::op::batchnorm::kBeta < this->c_.blob_out_grad_.size()) { - MSHADOW_TYPE_SWITCH( - this->c_.blob_out_grad_[mxnet::op::batchnorm::kBeta].type_flag_, - DTypeX, - { test::try_fill(this->c_.blob_out_grad_, mxnet::op::batchnorm::kBeta, DTypeX(0.1)); }); - } - - // in-grad - MSHADOW_TYPE_SWITCH( - this->c_.blob_in_grad_[mxnet::op::batchnorm::kData].type_flag_, - DTypeX, - { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kData, DTypeX(0)); }); - - // in-grad weights - if (mxnet::op::batchnorm::kGamma < this->c_.blob_in_grad_.size()) { - MSHADOW_TYPE_SWITCH( - this->c_.blob_in_grad_[mxnet::op::batchnorm::kGamma].type_flag_, - DTypeX, - { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kGamma, DTypeX(0)); }); - } - - // in-grad biases - if (mxnet::op::batchnorm::kBeta < this->c_.blob_in_grad_.size()) { - MSHADOW_TYPE_SWITCH( - this->c_.blob_in_grad_[mxnet::op::batchnorm::kBeta].type_flag_, - DTypeX, - { test::try_fill(this->c_.blob_in_grad_, mxnet::op::batchnorm::kBeta, DTypeX(0)); }); - } - } - - const bool hasWeightAndBias_; // This will cause forward pass validation to fail -}; - static const test::op::kwargs_t blank_kwargs; static const test::op::kwargs_t blank_kwargs_nocudnn = { {"cudnn_off", "True"} }; @@ -424,7 +455,7 @@ static const test::op::kwargs_t nfs_ugs_kwargs_nocudnn = { #if 
!DISABLE_VALIDATION static bool isUGS(const test::op::kwargs_t& kwargs) { for (test::op::kwargs_t::const_iterator i = kwargs.begin(), - e = kwargs.end(); i != e; ++i) { + e = kwargs.end(); i != e; ++i) { if (!i->first.compare("use_global_stats")) { return i->second.compare("True") == 0; } @@ -433,50 +464,48 @@ static bool isUGS(const test::op::kwargs_t& kwargs) { } #endif // DISABLE_VALIDATION -template -static StreamType& PRT( - StreamType *os, - const test::op::BasicOperatorData& obj, - const typename test::op::BasicOperatorData::BlobVectorType bvt, - const size_t idx) { - *os << test::op::BasicOperatorData::bvt2String(bvt) << ": " << idx +template +static StreamType& PRT(StreamType *os, const OperatorExecutor& obj, + const typename OperatorExecutor::BlobVectorType bvt, const size_t idx) { + *os << OperatorExecutor::bvt2String(bvt) << ": " << idx << ": "; const TBlob& blob = obj.getBlobVect(bvt)[idx]; - test::print(os, blob); + + test::print(RunContext(), os, blob); return *os; } -template +template static StreamType& dumpF(StreamType *os, - const test::op::OpInfo& prop, + const test::op::OpInfo& prop, const size_t x = 0) { - if (test::debugOutput) { + if (test::debug_output) { *os << std::endl; if (x) { *os << "=============================" << std::endl; *os << "= " << x << std::endl; *os << "=============================" << std::endl; } - typedef typename test::op::BasicOperatorData::BlobVectorType BlobVectorType; - PRT(os, *prop.data_, BlobVectorType::kInput, op::batchnorm::kData); - PRT(os, *prop.data_, BlobVectorType::kInput, op::batchnorm::kGamma); - PRT(os, *prop.data_, BlobVectorType::kInput, op::batchnorm::kBeta); + typedef typename OperatorExecutor::BlobVectorType BlobVectorType; + PRT(os, *prop.executor_, BlobVectorType::kInput, op::batchnorm::kData); + PRT(os, *prop.executor_, BlobVectorType::kInput, op::batchnorm::kGamma); + PRT(os, *prop.executor_, BlobVectorType::kInput, op::batchnorm::kBeta); - PRT(os, *prop.data_, BlobVectorType::kAux, 
op::batchnorm::kMovingMean); - PRT(os, *prop.data_, BlobVectorType::kAux, op::batchnorm::kMovingVar); + PRT(os, *prop.executor_, BlobVectorType::kAux, op::batchnorm::kMovingMean); + PRT(os, *prop.executor_, BlobVectorType::kAux, op::batchnorm::kMovingVar); - PRT(os, *prop.data_, BlobVectorType::kOutput, op::batchnorm::kOut); - PRT(os, *prop.data_, BlobVectorType::kOutput, op::batchnorm::kMean); - PRT(os, *prop.data_, BlobVectorType::kOutput, op::batchnorm::kVar); + PRT(os, *prop.executor_, BlobVectorType::kOutput, op::batchnorm::kOut); + PRT(os, *prop.executor_, BlobVectorType::kOutput, op::batchnorm::kMean); + PRT(os, *prop.executor_, BlobVectorType::kOutput, op::batchnorm::kVar); } return *os; } -template +template static StreamType& dumpB(StreamType *os, - const test::op::OpInfo& prop, + const test::op::OpInfo& prop, const size_t x = 0) { - if (test::debugOutput) { + if (test::debug_output) { *os << std::endl; if (x) { *os << "=============================" << std::endl; @@ -484,34 +513,34 @@ static StreamType& dumpB(StreamType *os, *os << "=============================" << std::endl; } - typedef typename test::op::BasicOperatorData::BlobVectorType BlobVectorType; - PRT(os, *prop.data_, BlobVectorType::kInGrad, op::batchnorm::kData); - PRT(os, *prop.data_, BlobVectorType::kInGrad, op::batchnorm::kGamma); - PRT(os, *prop.data_, BlobVectorType::kInGrad, op::batchnorm::kBeta); + typedef typename OperatorExecutor::BlobVectorType BlobVectorType; + PRT(os, *prop.executor_, BlobVectorType::kInGrad, op::batchnorm::kData); + PRT(os, *prop.executor_, BlobVectorType::kInGrad, op::batchnorm::kGamma); + PRT(os, *prop.executor_, BlobVectorType::kInGrad, op::batchnorm::kBeta); - PRT(os, *prop.data_, BlobVectorType::kAux, op::batchnorm::kMovingMean); - PRT(os, *prop.data_, BlobVectorType::kAux, op::batchnorm::kMovingVar); + PRT(os, *prop.executor_, BlobVectorType::kAux, op::batchnorm::kMovingMean); + PRT(os, *prop.executor_, BlobVectorType::kAux, op::batchnorm::kMovingVar); - 
PRT(os, *prop.data_, BlobVectorType::kOutGrad, op::batchnorm::kOut); + PRT(os, *prop.executor_, BlobVectorType::kOutGrad, op::batchnorm::kOut); } return *os; } -template +template static StreamType& dumpF(StreamType *os, - const test::op::OpInfoPair& bi) { + const test::op::OpInfoPair& bi) { return dumpF(&dumpF(os, bi.info_1_, 1), bi.info_2_, 2); } -template +template static StreamType& dumpB(StreamType *os, - const test::op::OpInfoPair& bi) { + const test::op::OpInfoPair& bi) { return dumpB(&dumpB(os, bi.info_1_, 1), bi.info_2_, 2); } /*! \brief Test batch norm operator forward pass */ -template -static test::op::OpInfo TestBatchNormOperatorForward( +template +static test::op::OpInfo TestBatchNormOperatorForward( bool isGPU, const TShape& inputShape, const std::vector >& kwargs, @@ -524,16 +553,17 @@ static test::op::OpInfo TestBatchNormOperatorForwa isGPU = false; #endif - test::op::OpInfo info = test::op::createOpAndInfoF< - OperatorProp, BNOperatorData, DType, AccReal>(isGPU, inputShape, kwargs); + test::op::OpInfo info = test::op::createOpAndInfoF< + OperatorProp, OperatorExecutor>(kwargs, isGPU, inputShape); - info.data_->initForward(*info.prop_, &info.in_type_); + info.executor_->initForward(*info.prop_, &info.in_type_); - info.data_->forward(count); + info.executor_->forward(count); #if !DISABLE_VALIDATION if (!isUGS(kwargs)) { - BatchNormValidator::validateForward(*info.data_); + BatchNormValidator::validateForward(*info.executor_); } #endif @@ -541,20 +571,20 @@ static test::op::OpInfo TestBatchNormOperatorForwa } /*! 
\brief Test batch norm operator backward pass */ -template -static test::op::OpInfo runOperatorBackward( - test::op::OpInfo *info, +template +static test::op::OpInfo runOperatorBackward( + test::op::OpInfo *info, const size_t count = 1) { - info->data_->initBackward(*info->prop_, &info->in_type_); + info->executor_->initBackward(*info->prop_, &info->in_type_); - info->data_->backward(count); + info->executor_->backward(count); return *info; } static constexpr size_t CYCLE_COUNT = 3; -template -static test::op::OpInfoPair testForwardAndBackward( +template +static test::op::OpInfoPair testForwardAndBackward( const bool isGPU1, const bool isGPU2, const TShape &inputShape, @@ -562,22 +592,25 @@ static test::op::OpInfoPair testFo const bool dumpC, const size_t count = 1, const size_t cycleCount = CYCLE_COUNT) { - test::op::OpInfo info_1 = - TestBatchNormOperatorForward(isGPU1, inputShape, - kwargs, count); + test::op::OpInfo info_1 = + TestBatchNormOperatorForward(isGPU1, inputShape, + kwargs, count); - test::op::OpInfo info_2 = - TestBatchNormOperatorForward(isGPU2, inputShape, - kwargs, count); + test::op::OpInfo info_2 = + TestBatchNormOperatorForward(isGPU2, inputShape, + kwargs, count); size_t thisCount = 0; + typedef typename OperatorExecutor::DataType DType; + typedef typename OperatorExecutor::AccRealType AccReal; + do { const bool isLast = thisCount == cycleCount - 1; if (thisCount) { - info_1.data_->forward(count); - info_2.data_->forward(count); + info_1.executor_->forward(count); + info_2.executor_->forward(count); } if (isLast) { @@ -588,18 +621,18 @@ static test::op::OpInfoPair testFo // Check that everything is the same after the forward pass BatchNormValidator::compare(info_1, info_2); - test::op::Validator::compare( - *info_1.data_, *info_2.data_, - test::op::BasicOperatorData::kInput, - op::batchnorm::kData); + BatchNormValidator::compare( + *info_1.executor_, *info_2.executor_, + test::op::LegacyOperatorExecutor::kInput, + op::batchnorm::kData, 
false); if (!thisCount) { // return backward runOperatorBackward(&info_1, count); runOperatorBackward(&info_2, count); } else { - info_1.data_->backward(count); - info_2.data_->backward(count); + info_1.executor_->backward(count); + info_2.executor_->backward(count); } if (isLast) { @@ -612,14 +645,13 @@ static test::op::OpInfoPair testFo } while (++thisCount < cycleCount); if (dumpC) { - info_1.data_->dumpC(&std::cerr, "BN_testForwardAndBackward"); + info_1.executor_->dumpC(&std::cerr, "BN_testForwardAndBackward"); } return { info_1, info_2 }; } - -template -static test::op::OpInfoPair +template +static test::op::OpInfoPair testForwardAndBackward(const bool isGPU, const TShape &inputShape, const test::op::kwargs_t kwargs, @@ -627,7 +659,7 @@ testForwardAndBackward(const bool isGPU, const size_t count = 1, const size_t cycleCount = CYCLE_COUNT ) { - return testForwardAndBackward( + return testForwardAndBackward( isGPU, isGPU, inputShape, @@ -637,14 +669,14 @@ testForwardAndBackward(const bool isGPU, cycleCount); } -template -static test::op::OpInfoPair +template +static test::op::OpInfoPair testBNForwardAndBackward2D(const bool isGPU, - const TShape &inputShape, - const test::op::kwargs_t kwargs, - const bool dumpC = false) { + const TShape &inputShape, + const test::op::kwargs_t kwargs, + const bool dumpC = false) { CHECK_EQ(inputShape.ndim(), 4); // V1 can only handle 2D - return testForwardAndBackward( + return testForwardAndBackward( isGPU, isGPU, inputShape, @@ -661,7 +693,7 @@ TEST(BATCH_NORM, Test2DForwardV1V2) { DType, AccReal, { - auto infoA = testBNForwardAndBackward2D( + auto infoA = testBNForwardAndBackward2D>( false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); }); } @@ -675,14 +707,14 @@ TEST(BATCH_NORM, Test1DForward) { MSHADOW_REAL_TYPE_SWITCH_EX( type, DType, AccReal, { - TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( false, {BATCH_SIZE, CHANNELS, DW}, blank_kwargs); }); } } TEST(BATCH_NORM, Test2DForwardV1) { - 
TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); @@ -693,7 +725,8 @@ TEST(BATCH_NORM, Test2DForward) { MSHADOW_REAL_TYPE_SWITCH_EX( type, DType, AccReal, { - auto opInfoFloatH = TestBatchNormOperatorForward( + auto opInfoFloatH = TestBatchNormOperatorForward>( false, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); }); } @@ -704,13 +737,13 @@ TEST(BATCH_NORM, Test3DForward) { MSHADOW_REAL_TYPE_SWITCH_EX( type, DType, AccReal, { - TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( false, {BATCH_SIZE, CHANNELS, DEPTH, DH, DW}, blank_kwargs); }); } } -template +template static void timingTest(const std::string& label, const bool isGPU, const bool stochastic, @@ -751,22 +784,22 @@ static void timingTest(const std::string& label, const size_t D = dim ? dim - 1U : test::rangedRand(0U, 2U); - test::op::OpInfo info; + test::op::OpInfo info; switch (D) { case 0: - info = TestBatchNormOperatorForward( + info = TestBatchNormOperatorForward( isGPU, {batchSize, channels, width}, kwargs, count); break; case 1: - info = TestBatchNormOperatorForward( + info = TestBatchNormOperatorForward( isGPU, {batchSize, channels, height, width}, kwargs, count); break; case 2: - info = TestBatchNormOperatorForward( + info = TestBatchNormOperatorForward( isGPU, {batchSize, channels, depth, height, width}, kwargs, count); @@ -774,9 +807,9 @@ static void timingTest(const std::string& label, default: CHECK(false) << "rangedRand() returned unexpected value"; } - if (info.data_.get()) { - runOperatorBackward(&info, count); - timing += info.data_->timing_; + if (info.executor_.get()) { + runOperatorBackward(&info, count); + timing += info.executor_->GetTiming(); } } while (false); @@ -795,19 +828,17 @@ TEST(BATCH_NORM, TestStochasticTiming_2D) { MSHADOW_REAL_TYPE_SWITCH_EX( mshadow::kFloat32, DType, AccReal, { - timingTest("RANDOM: BatchNormProp", - false, true, - blank_kwargs_nocudnn, - GPU_TEST_DIMENSIONS); }); + 
timingTest>( + "RANDOM: BatchNormProp", false, true, + blank_kwargs_nocudnn, GPU_TEST_DIMENSIONS); }); #if MXNET_USE_CUDA if (test::unitTestsWithCuda) { MSHADOW_REAL_TYPE_SWITCH_EX( mshadow::kFloat32, DType, AccReal, { - timingTest("RANDOM: BatchNormProp", - true, true, - blank_kwargs_nocudnn, - GPU_TEST_DIMENSIONS); }); + timingTest>( + "RANDOM: BatchNormProp", true, true, + blank_kwargs_nocudnn, GPU_TEST_DIMENSIONS); }); } #endif } @@ -822,43 +853,48 @@ TEST(BATCH_NORM, TestTiming_2D) { if (mxnet::test::quick_test) { THISCOUNT = 1; } - MSHADOW_REAL_TYPE_SWITCH_EX( - mshadow::kFloat32, DType, AccReal, - { - timingTest("BatchNormV1Prop 2D", - false, false, - blank_kwargs, - 2, THISCOUNT); +MSHADOW_REAL_TYPE_SWITCH_EX( + mshadow::kFloat32, DType, AccReal, { + timingTest>( + "BatchNormV1Prop 2D", + false, false, + blank_kwargs, + 2, THISCOUNT); #if MXNET_USE_MKL2017 == 1 - timingTest("MKL BatchNormProp 2D", - false, false, - blank_kwargs_nocudnn, - 2, THISCOUNT); + timingTest>( + "MKL BatchNormProp 2D", + false, false, + blank_kwargs_nocudnn, + 2, THISCOUNT); #endif - test::ScopeSet disableMKL(&mxnet::op::batchnorm::disable_mkl, true); - timingTest("BatchNormProp 2D", - false, false, - blank_kwargs_nocudnn, - 2, THISCOUNT); + test::ScopeSet disableMKL(&mxnet::op::batchnorm::disable_mkl, true); + timingTest>( + "BatchNormProp 2D", + false, false, + blank_kwargs_nocudnn, + 2, THISCOUNT); #if MXNET_USE_CUDA - if (test::unitTestsWithCuda) { - timingTest("BatchNormV1Prop 2D", - true, false, - blank_kwargs, - 2, THISCOUNT); - timingTest("BatchNormProp 2D", - true, false, - blank_kwargs_nocudnn, - 2, THISCOUNT); + if (test::unitTestsWithCuda) { + timingTest>( + "BatchNormV1Prop 2D", + true, false, + blank_kwargs, + 2, THISCOUNT); + timingTest>( + "BatchNormProp 2D", + true, false, + blank_kwargs_nocudnn, + 2, THISCOUNT); #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 - timingTest("CUDNN BatchNormProp 2D", - true, false, - blank_kwargs, - 2, THISCOUNT); + timingTest>( + "CUDNN 
BatchNormProp 2D", + true, false, + blank_kwargs, + 2, THISCOUNT); #endif - } + } #endif - }); +}); } /** @@ -867,8 +903,8 @@ TEST(BATCH_NORM, TestTiming_2D) { template struct BothInfo { - test::op::OpInfo info_v1_; - test::op::OpInfo info_; + test::op::OpInfo> info_v1_; + test::op::OpInfo> info_; }; TEST(BATCH_NORM, TestBackward2D_Simple) { @@ -876,8 +912,10 @@ TEST(BATCH_NORM, TestBackward2D_Simple) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair< + op::BatchNormV1Prop, op::BatchNormProp, BNOperatorExecutor> bi = + testForwardAndBackward< + op::BatchNormV1Prop, op::BatchNormProp, BNOperatorExecutor>( false, inputShape, blank_kwargs); // Keep it simple }); } @@ -905,14 +943,16 @@ TEST(BATCH_NORM, TestIterAll) { MSHADOW_REAL_TYPE_SWITCH_EX( type, DType, AccReal, { - test::op::OpInfoPair + test::op::OpInfoPair> bi = testForwardAndBackward( + BNOperatorExecutor>( g1 != 0, g2 != 0, shape, kwargs, false); // Keep it simple if (shape.ndim() == 4 && type == mshadow::kFloat32 && !x3) { - test::op::OpInfoPair + test::op::OpInfoPair> bi = testForwardAndBackward( + BNOperatorExecutor>( g1 != 0, g2 != 0, shape, kwargs, false); // Keep it simple } }); @@ -942,21 +982,21 @@ static void test_V1_V2_2D(const test::op::kwargs_t &kwargs, const size_t count) TShape shapes[2] = {2, 3}; const TShape inputShape({2, 3}); - test::op::OpInfo info_1 = test::op::createOpAndInfoF< - op::BatchNormV1Prop, - BNOperatorData, - DType, AccReal>(gpu_V1, inputShape, kwargs); + test::op::OpInfo> info_1 = + test::op::createOpAndInfoF< + op::BatchNormV1Prop, BNOperatorExecutor>( + kwargs, gpu_V1, inputShape); - test::op::OpInfo info_2 = test::op::createOpAndInfoF< - op::BatchNormProp, BNOperatorData, DType, AccReal>( - gpu_V2, inputShape, kwargs); + test::op::OpInfo> info_2 = + test::op::createOpAndInfoF>( + kwargs, gpu_V2, inputShape); - info_1.data_->initForward(*info_1.prop_, &info_1.in_type_); 
- info_2.data_->initForward(*info_1.prop_, &info_1.in_type_); - info_1.data_->initBackward(*info_1.prop_, &info_1.in_type_); - info_2.data_->initBackward(*info_1.prop_, &info_1.in_type_); + info_1.executor_->initForward(*info_1.prop_, &info_1.in_type_); + info_2.executor_->initForward(*info_1.prop_, &info_1.in_type_); + info_1.executor_->initBackward(*info_1.prop_, &info_1.in_type_); + info_2.executor_->initBackward(*info_1.prop_, &info_1.in_type_); - TBlob &blob1 = info_1.data_->c_.blob_input_vec_[op::batchnorm::kData]; + TBlob &blob1 = info_1.executor_->inputs()[op::batchnorm::kData]; test::data_ref(&blob1, {0, 0}) = -0.05f; test::data_ref(&blob1, {0, 1}) = -0.19f; test::data_ref(&blob1, {0, 2}) = 0.02f; @@ -964,7 +1004,7 @@ static void test_V1_V2_2D(const test::op::kwargs_t &kwargs, const size_t count) test::data_ref(&blob1, {1, 1}) = 0.06f; test::data_ref(&blob1, {1, 2}) = -0.01f; - TBlob &blob2 = info_2.data_->c_.blob_input_vec_[op::batchnorm::kData]; + TBlob &blob2 = info_2.executor_->inputs()[op::batchnorm::kData]; test::data_ref(&blob2, {0, 0}) = -0.05f; test::data_ref(&blob2, {0, 1}) = -0.19f; test::data_ref(&blob2, {0, 2}) = 0.02f; @@ -972,20 +1012,20 @@ static void test_V1_V2_2D(const test::op::kwargs_t &kwargs, const size_t count) test::data_ref(&blob2, {1, 1}) = 0.06f; test::data_ref(&blob2, {1, 2}) = -0.01f; - test::data_ref(&info_1.data_->c_.blob_input_vec_[op::batchnorm::kGamma], {1}) = 3; - test::data_ref(&info_2.data_->c_.blob_input_vec_[op::batchnorm::kGamma], {1}) = 3; + test::data_ref(&info_1.executor_->inputs()[op::batchnorm::kGamma], {1}) = 3; + test::data_ref(&info_2.executor_->inputs()[op::batchnorm::kGamma], {1}) = 3; - test::data_ref(&info_1.data_->c_.blob_input_vec_[op::batchnorm::kBeta], {0}) = 3; - test::data_ref(&info_2.data_->c_.blob_input_vec_[op::batchnorm::kBeta], {0}) = 3; + test::data_ref(&info_1.executor_->inputs()[op::batchnorm::kBeta], {0}) = 3; + test::data_ref(&info_2.executor_->inputs()[op::batchnorm::kBeta], {0}) = 3; for 
(size_t x = 0; x < count; ++x) { - info_1.data_->forward(); - info_2.data_->forward(); + info_1.executor_->forward(); + info_2.executor_->forward(); BatchNormValidator::compare(info_1, info_2); - info_1.data_->backward(); - info_2.data_->backward(); + info_1.executor_->backward(); + info_2.executor_->backward(); BatchNormValidator::compare(info_1, info_2); } @@ -1009,8 +1049,10 @@ TEST(BATCH_NORM, TestBackward2D_SimpleNFG) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair> bi = + testForwardAndBackward>( false, inputShape, nonfixgamma_kwargs); }); } @@ -1019,10 +1061,12 @@ TEST(BATCH_NORM, Test2DBackward_Complex) { MSHADOW_REAL_TYPE_SWITCH_EX( mshadow::kFloat32, DType, AccReal, { - test::ScopeSet noDebugOutput(&test::debugOutput, false); + test::ScopeSet noDebugOutput(&test::debug_output, false); const TShape inputShape({9, 14, 16, 91}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair> bi = + testForwardAndBackward>( false, inputShape, blank_kwargs); }); } @@ -1031,12 +1075,14 @@ struct Test2DBackward2DPlusLoadAndCompareLogicUtil { template static void test() { const TShape inputShape({1, 1, 2, 1}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair> bi = + testForwardAndBackward>( false, inputShape, blank_kwargs, false, 1, 5); #if MXNET_DUMP_C - bi.info_1_.data_->dumpC(&std::cerr, "Test2DBackward2DPlusLoadAndCompareLogic"); + bi.info_1_.executor_->dumpC(&std::cerr, "Test2DBackward2DPlusLoadAndCompareLogic"); #endif static const std::vector< std::vector< std::vector > > @@ -1067,17 +1113,17 @@ struct Test2DBackward2DPlusLoadAndCompareLogicUtil { // Expected data state when running forward+backward starting with default values // Note: This data structure generated by dumpC() // Test loaded data agsinst calculated data - test::op::OpInfo info_checkLoad = - test::op::createOpAndInfoF, - DType, AccReal>(false, 
inputShape, blank_kwargs); - info_checkLoad.data_->initForward(*info_checkLoad.prop_, &info_checkLoad.in_type_); - info_checkLoad.data_->initBackward(*info_checkLoad.prop_, &info_checkLoad.in_type_); - info_checkLoad.data_->load(___Test2DBackward2DPlusLoadAndCompareLogic_data_shape_1_1_2_1___); + test::op::OpInfo> info_checkLoad = + test::op::createOpAndInfoF>( + blank_kwargs, false, inputShape); + info_checkLoad.executor_->initForward(*info_checkLoad.prop_, &info_checkLoad.in_type_); + info_checkLoad.executor_->initBackward(*info_checkLoad.prop_, &info_checkLoad.in_type_); + info_checkLoad.executor_->load( + ___Test2DBackward2DPlusLoadAndCompareLogic_data_shape_1_1_2_1___); BatchNormValidator::compare(bi.info_1_, info_checkLoad); } }; - TEST(BATCH_NORM, Test2DBackward2DPlusLoadAndCompareLogic) { test::ScopeSet disableMKL(&mxnet::op::batchnorm::disable_mkl, true); MSHADOW_REAL_TYPE_SWITCH_EX( @@ -1087,17 +1133,20 @@ TEST(BATCH_NORM, Test2DBackward2DPlusLoadAndCompareLogic) { }); } -template +template void compare(const bool isGPU, - const test::op::OpInfo& object, - const std::vector< std::vector< std::vector > >& values) { - test::op::OpInfo info_checkLoad = - test::op::createOpAndInfoF, DType, AccReal>( - isGPU, object.data_->c_.blob_input_vec_[0].shape_, blank_kwargs); - info_checkLoad.data_->initForward(*info_checkLoad.prop_, &info_checkLoad.in_type_); - info_checkLoad.data_->initBackward(*info_checkLoad.prop_, &info_checkLoad.in_type_); - info_checkLoad.data_->load(values); - BatchNormValidator::compare(object, info_checkLoad); + const test::op::OpInfo& object, + const std::vector< + std::vector< std::vector > >& values) { + test::op::OpInfo info_checkLoad = + test::op::createOpAndInfoF( + blank_kwargs, isGPU, object.executor_->inputs()[0].shape_); + info_checkLoad.executor_->initForward(*info_checkLoad.prop_, &info_checkLoad.in_type_); + info_checkLoad.executor_->initBackward(*info_checkLoad.prop_, &info_checkLoad.in_type_); + 
info_checkLoad.executor_->load(values); + BatchNormValidator< + typename OperatorExecutor::DataType, + typename OperatorExecutor::AccRealType>::compare(object, info_checkLoad); } TEST(BATCH_NORM, TestBackward1D_Simple) { @@ -1105,44 +1154,43 @@ TEST(BATCH_NORM, TestBackward1D_Simple) { mshadow::kFloat32, DTypeX, AccReal, { const TShape inputShape({1, 1, 2}); - test::op::OpInfo info = - TestBatchNormOperatorForward(false, - inputShape, - blank_kwargs); - info.data_->initBackward(*info.prop_, &info.in_type_); + test::op::OpInfo> info = + TestBatchNormOperatorForward>( + false, inputShape, blank_kwargs); + info.executor_->initBackward(*info.prop_, &info.in_type_); runOperatorBackward(&info); #if MXNET_DUMP_C - info.data_->dumpC(&std::cerr, "BN_TestBackward1D_Simple"); + info.executor_->dumpC(&std::cerr, "BN_TestBackward1D_Simple"); #endif // Expected data state when running forward+backward starting with default values // Note: This data structure generated by dumpC() static const std::vector< std::vector< std::vector > > ___BN_TestBackward1D_Simple_data_shape_1_1_2___ = { - { /* kInput */ - { 1.0f, 2.0f }, - { 1.0f }, - { 0.0f } - }, - { /* kOutput */ - { -0.998006f, 0.998006f }, - { 1.5f }, - { 0.25f } - }, - { /* kAux */ - { 0.15f }, - { 0.925f } - }, - { /* kInGrad */ - { -0.00397621f, 0.00397609f }, - { 0.0f }, - { 2.998f } - }, - { /* kOutGrad */ - { 0.999f, 1.999f } - } - }; + { /* kInput */ + { 1.0f, 2.0f }, + { 1.0f }, + { 0.0f } + }, + { /* kOutput */ + { -0.998006f, 0.998006f }, + { 1.5f }, + { 0.25f } + }, + { /* kAux */ + { 0.15f }, + { 0.925f } + }, + { /* kInGrad */ + { -0.00397621f, 0.00397609f }, + { 0.0f }, + { 2.998f } + }, + { /* kOutGrad */ + { 0.999f, 1.999f } + } + }; compare(false, info, ___BN_TestBackward1D_Simple_data_shape_1_1_2___); }); } @@ -1152,13 +1200,13 @@ TEST(BATCH_NORM, TestBackward3D) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({2, 3, 2, 3, 5}); - test::op::OpInfo info = - TestBatchNormOperatorForward( + 
test::op::OpInfo> info = + TestBatchNormOperatorForward>( false, inputShape, blank_kwargs); - info.data_->initBackward(*info.prop_, &info.in_type_); + info.executor_->initBackward(*info.prop_, &info.in_type_); runOperatorBackward(&info); #if MXNET_DUMP_C - info.data_->dumpC(&std::cerr, "TestBackward3D"); + info.executor_->dumpC(&std::cerr, "TestBackward3D"); #endif }); } @@ -1169,8 +1217,10 @@ TEST(BATCH_NORM, Test2DBackwardMixed_cpu_cpu_nfg) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair> bi = + testForwardAndBackward>( false, false, inputShape, nonfixgamma_kwargs, false); dumpF(&std::cout, bi); dumpB(&std::cout, bi); @@ -1183,8 +1233,10 @@ TEST(BATCH_NORM, Test2DBackwardMixed_cpu_cpu_ugs) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair> bi = + testForwardAndBackward>( false, false, inputShape, useglobalstats_kwargs, false); dumpF(&std::cout, bi); dumpB(&std::cout, bi); @@ -1208,7 +1260,7 @@ class ChannelAxisTestData { std::vector indexes(channel_count, 0); for (size_t outer = 0, outerCount = tensor3.OuterSize(); outer < outerCount; ++outer) { for (size_t channel = 0, channelCount = tensor3.ChannelCount(); - channel < channelCount; ++channel) { + channel < channelCount; ++channel) { CHECK_LT(channel, channel_data_.size()); for (size_t inner = 0, innerCount = tensor3.InnerSize(); inner < innerCount; ++inner) { CHECK_LT(indexes[channel], channel_data_[channel].size()); @@ -1226,7 +1278,7 @@ class ChannelAxisTestData { std::vector> channel_data_; static void print(const std::string& label, const std::vector>& m) { - if (test::debugOutput) { + if (test::debug_output) { if (!label.empty()) { std::cout << label << ": "; } @@ -1248,7 +1300,7 @@ class ChannelAxisTestData { } static void print(const std::string& label, const TBlob& blob) { - if 
(test::debugOutput) { + if (test::debug_output) { if (!label.empty()) { std::cout << label << ": "; } @@ -1266,7 +1318,7 @@ class ChannelAxisTestData { } void save(const TBlob& blob, const int channel_axis) { - loadOrSave(blob, channel_axis, SAVE); + loadOrSave(blob, channel_axis, SAVE); } void load(const TBlob& blob, const int channel_axis) { @@ -1281,8 +1333,8 @@ static void compare(const TBlob& blob, const std::vector& vals) { for (size_t i = 0, n = vals.size(); i < n; ++i) { const DType vBlob = v[i]; const DType vVect = vals[i]; - const bool near = test::op::Validator::isNear( - vBlob, vVect, test::op::Validator::ErrorBound(&blob)); + const bool near = BatchNormValidator::isNear( + vBlob, vVect, BatchNormValidator::ErrorBound(&blob)); EXPECT_TRUE(near); if (!near) { LOG(WARNING) << vBlob << " is not near enough to " << vVect << std::endl; @@ -1301,8 +1353,8 @@ static void compare(const std::vector>& d1, for (size_t i = 0, n = vec1.size(); i < n; ++i) { const DType v1 = vec1[i]; const DType v2 = vec2[i]; - const bool near = test::op::Validator::isNear( - v1, v2, test::op::Validator::ERROR_BOUND()); + const bool near = BatchNormValidator::isNear( + v1, v2, BatchNormValidator::ERROR_BOUND()); EXPECT_TRUE(near); if (!near) { LOG(WARNING) << v1 << " is not near enough to " << v2 << std::endl; @@ -1364,7 +1416,7 @@ TEST(BATCH_NORM, TestChannelAxisSaveAndLoad) { /*! 
\brief Insert the channel field `channelCount` into the shape at `channelAxis` position */ static TShape MakeShape(const std::vector& shape, - unsigned int channelAxis, + signed int channelAxis, const size_t channelCount) { if (channelAxis < 0) { channelAxis += shape.size() + 1; @@ -1442,66 +1494,67 @@ static void runChannelAxisTest( // Create operator 1 with ChannelAxis2 (normally the experimental one) kwargs.push_back({"axis", std::to_string(channelAxis1)}); - test::op::OpInfo info_c1 = test::op::createOpAndInfoF< - op::BatchNormProp, BNOperatorData, DType, AccReal>( - isGPU1, shape_c1, kwargs); + test::op::OpInfo> info_c1 = + test::op::createOpAndInfoF< + op::BatchNormProp, BNOperatorExecutor>( + kwargs, isGPU1, shape_c1); // Create operator 2 with ChannelAxis2 (normally the control one) kwargs.pop_back(); kwargs.push_back({"axis", std::to_string(channelAxis2)}); - test::op::OpInfo info_c2 = test::op::createOpAndInfoF< - op::BatchNormProp, BNOperatorData, DType, AccReal>( - isGPU2, shape_c2, kwargs); + test::op::OpInfo> info_c2 = + test::op::createOpAndInfoF>( + kwargs, isGPU2, shape_c2); kwargs.pop_back(); // Init operators - info_c1.data_->initForward(*info_c1.prop_, &info_c1.in_type_); - info_c1.data_->initBackward(*info_c1.prop_, &info_c1.in_type_); - info_c2.data_->initForward(*info_c2.prop_, &info_c2.in_type_); - info_c2.data_->initBackward(*info_c2.prop_, &info_c2.in_type_); + info_c1.executor_->initForward(*info_c1.prop_, &info_c1.in_type_); + info_c1.executor_->initBackward(*info_c1.prop_, &info_c1.in_type_); + info_c2.executor_->initForward(*info_c2.prop_, &info_c2.in_type_); + info_c2.executor_->initBackward(*info_c2.prop_, &info_c2.in_type_); // Save input data to blob with new shape 1 - data_c1.save(info_c1.data_->c_.blob_input_vec_[0], channelAxis1); - ChannelAxisTestData::print("blob 1 input", info_c1.data_->c_.blob_input_vec_[0]); + data_c1.save(info_c1.executor_->inputs()[0], channelAxis1); + ChannelAxisTestData::print("blob 1 input", 
info_c1.executor_->inputs()[0]); // Save input data to blob with new shape 2 - data_c2.save(info_c2.data_->c_.blob_input_vec_[0], channelAxis2); - ChannelAxisTestData::print("blob 2 input", info_c2.data_->c_.blob_input_vec_[0]); + data_c2.save(info_c2.executor_->inputs()[0], channelAxis2); + ChannelAxisTestData::print("blob 2 input", info_c2.executor_->inputs()[0]); // Save output grad to blob with new shape 1 - grad_c1.save(info_c1.data_->c_.blob_out_grad_[0], channelAxis1); - ChannelAxisTestData::print("blob 1 output grad", info_c1.data_->c_.blob_out_grad_[0]); + grad_c1.save(info_c1.executor_->bwd_inputs()[0], channelAxis1); + ChannelAxisTestData::print("blob 1 output grad", info_c1.executor_->bwd_inputs()[0]); // Save output grad to blob with new shape 2 - grad_c2.save(info_c2.data_->c_.blob_out_grad_[0], channelAxis2); - ChannelAxisTestData::print("blob 2 output grad", info_c2.data_->c_.blob_out_grad_[0]); + grad_c2.save(info_c2.executor_->bwd_inputs()[0], channelAxis2); + ChannelAxisTestData::print("blob 2 output grad", info_c2.executor_->bwd_inputs()[0]); // Run both operators forward and backwards several times for (index_t x = 0; x < numberOfPasses; ++x) { - info_c1.data_->forward(); - info_c2.data_->forward(); + info_c1.executor_->forward(); + info_c2.executor_->forward(); - info_c1.data_->backward(); - info_c2.data_->backward(); + info_c1.executor_->backward(); + info_c2.executor_->backward(); } // Transform operator 1's blob output to a normalized shape - data_c1.load(info_c1.data_->c_.blob_output_vec_[0], channelAxis1); + data_c1.load(info_c1.executor_->outputs()[0], channelAxis1); ChannelAxisTestData::print("channel data 1", data_c1.channel_data_); // Transform operator 2's blob output to a normalized shape - data_c2.load(info_c2.data_->c_.blob_output_vec_[0], channelAxis2); + data_c2.load(info_c2.executor_->outputs()[0], channelAxis2); ChannelAxisTestData::print("channel data 2", data_c2.channel_data_); // Compare the operators' output data while 
they're in a normalized shape compare(data_c1.channel_data_, data_c2.channel_data_); // Transform operator 1's input-grad blob to a normalized shape - grad_c1.load(info_c1.data_->c_.blob_in_grad_[0], channelAxis1); + grad_c1.load(info_c1.executor_->bwd_outputs()[0], channelAxis1); ChannelAxisTestData::print("input grad 1", grad_c1.channel_data_); // Transform operator 2's input-grad blob to a normalized shape - grad_c2.load(info_c2.data_->c_.blob_in_grad_[0], channelAxis2); + grad_c2.load(info_c2.executor_->bwd_outputs()[0], channelAxis2); ChannelAxisTestData::print("input grad 2", grad_c2.channel_data_); // Compare the operators' input grad data while they're in a normalized shape @@ -1533,12 +1586,15 @@ TEST(BATCH_NORM, TestChannelAxisSimple) { * backward result equivalence here implies correctness for other channel positions */ TEST(BATCH_NORM, TestChannelAxis) { - test::ScopeSet noDebugOutput(&test::debugOutput, false); + test::ScopeSet noDebugOutput(&test::debug_output, false); test::op::kwargs_t kwargs; const std::vector> shapes = - { {1, 2}, {1, 2, 1}, {1, 2, 3}, {1, 2, 3, 4} }; - const char *tof[2] = { "False", "True" }; + {{1, 2}, + {1, 2, 1}, + {1, 2, 3}, + {1, 2, 3, 4}}; + const char *tof[2] = {"False", "True"}; for (size_t x1 = 0; x1 < 2U; ++x1) { kwargs.push_back({"fix_gamma", tof[x1]}); @@ -1576,11 +1632,11 @@ TEST(BATCH_NORM, Test2DForwardV12D_gpu) { MSHADOW_REAL_TYPE_SWITCH_EX( mshadow::kFloat32, DType, AccReal, { - TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( true, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); - TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( true, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); @@ -1592,11 +1648,11 @@ TEST(BATCH_NORM, Test2DForward2D_gpu) { MSHADOW_REAL_TYPE_SWITCH_EX( type, DType, AccReal, { - TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( true, {BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs); - TestBatchNormOperatorForward( + TestBatchNormOperatorForward>( true, 
{BATCH_SIZE, CHANNELS, DH, DW}, blank_kwargs_nocudnn); @@ -1610,7 +1666,8 @@ TEST(BATCH_NORM, Test2DBackwardMixedV1_gpu_cpu) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, blank_kwargs, false); }); } @@ -1620,7 +1677,8 @@ TEST(BATCH_NORM, Test2DBackwardMixedV1Complex_gpu_cpu) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, blank_kwargs, false); }); } @@ -1631,9 +1689,11 @@ TEST(BATCH_NORM, Test2DBackwardMixed_gpu_cpu) { type, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, blank_kwargs, false); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, blank_kwargs_nocudnn, false); }); } @@ -1645,9 +1705,11 @@ TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu) { type, DType, AccReal, { const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, blank_kwargs, false); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, blank_kwargs_nocudnn, false); }); } @@ -1661,7 +1723,8 @@ TEST(BATCH_NORM, Test2DBackwardMixedV1V2Complex_cpu_cpu_nfg) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - testForwardAndBackward( + testForwardAndBackward>( false, false, inputShape, nonfixgamma_kwargs, false); }); } @@ -1672,9 +1735,11 @@ TEST(BATCH_NORM, Test2DBackwardMixed_gpu_cpu_nfg) { type, DType, AccReal, { const TShape inputShape({1, 1, 2, 1}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, nonfixgamma_kwargs, false); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, nonfixgamma_kwargs_nocudnn, false); }); } @@ -1686,9 +1751,11 @@ 
TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu_nfg) { type, DType, AccReal, { const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, nonfixgamma_kwargs, false); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, nonfixgamma_kwargs_nocudnn, false); }); } @@ -1702,8 +1769,10 @@ TEST(BATCH_NORM, Test2DBackwardMixedV1V2Complex_cpu_cpu_ugs) { mshadow::kFloat32, DType, AccReal, { const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - test::op::OpInfoPair bi = - testForwardAndBackward( + test::op::OpInfoPair> bi = + testForwardAndBackward>( false, false, inputShape, useglobalstats_kwargs, false); dumpF(&std::cout, bi); dumpB(&std::cout, bi); @@ -1716,9 +1785,11 @@ TEST(BATCH_NORM, Test2DBackwardMixed_gpu_cpu_ugs) { type, DType, AccReal, { const TShape inputShape({2, 3, 2, 2}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, useglobalstats_kwargs_nocudnn, false); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, useglobalstats_kwargs, false); }); } @@ -1730,13 +1801,14 @@ TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu_ugs) { type, DType, AccReal, { const TShape inputShape({BATCH_SIZE, CHANNELS, DH, DW}); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, useglobalstats_kwargs, false); - testForwardAndBackward( + testForwardAndBackward>( false, true, inputShape, useglobalstats_kwargs_nocudnn, false); }); } } #endif // MXNET_USE_CUDA - diff --git a/tests/cpp/operator/core_op_runner_test.cc b/tests/cpp/operator/core_op_runner_test.cc index cf7f0b9f39e7..6cc2baddae28 100644 --- a/tests/cpp/operator/core_op_runner_test.cc +++ b/tests/cpp/operator/core_op_runner_test.cc @@ -38,21 +38,6 @@ using kwargs_t = test::op::kwargs_t; static const kwargs_t basic_args = {}; -<<<<<<< HEAD -static const std::vector> test_unary_operators = - { - {"relu", "" }, // Code can figure out what 
the backward op is for some - {"sigmoid", "" }, - { "sqrt", "" } - }; - - -static const std::vector> test_binary_operators = - { - {"elemwise_add", "_backward_add"}, - {"elemwise_mul", "_backward_mul"} - }; -======= static const std::vector> test_unary_operators = { { "relu", "" }, // Code can figure out what the backward op is for some { "sigmoid", "" }, @@ -68,16 +53,11 @@ template inline std::vector AsVect(const TT& t) { return std::move(std::vector({ t })); } ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 /*! * \brief Generic bidirectional sanity test for simple unary op */ -<<<<<<< HEAD -TEST(CORE_OP_RUNNER, ExecuteBidirectionalSimpleUnary) { -======= TEST(CORE_OP_RUNNER, ExecuteBidirectionalSimpleUnaryList) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 TShape shape({5, 5}); kwargs_t kwargs = basic_args; @@ -85,11 +65,7 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalSimpleUnaryList) { const char *op_name = i.first.c_str(); const char *backward_op_name = i.second.c_str(); -<<<<<<< HEAD - test::op::CoreOpExecutor op(false, shape); -======= test::op::CoreOpExecutor op(false, AsVect(shape)); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 op.set_verbose(false); op.Init(op.ArgsWithOpName(kwargs, op_name, backward_op_name)); @@ -109,11 +85,7 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalSimpleUnaryList) { /*! 
* \brief Generic bidirectional sanity test for binary op */ -<<<<<<< HEAD -TEST(CORE_OP_RUNNER, ExecuteBidirectional) { -======= TEST(CORE_OP_RUNNER, ExecuteBidirectionalList) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 for (const std::pair& i : test_binary_operators) { const char *op_name = i.first.c_str(); const char *backward_op_name = i.second.c_str(); @@ -121,11 +93,7 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalList) { TShape shape({5, 5}); kwargs_t kwargs = basic_args; -<<<<<<< HEAD - test::op::CoreOpExecutor op(false, shape); -======= test::op::CoreOpExecutor op(false, AsVect(shape)); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 op.set_verbose(false); op.Init(op.ArgsWithOpName(kwargs, op_name, backward_op_name)); @@ -142,8 +110,6 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalList) { } } -<<<<<<< HEAD -======= /*! * \brief Execute bidirectional dot product, which has different shaped inputs and outputs */ @@ -169,7 +135,6 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalDotProduct) { PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs()); } ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 TEST(CORE_OP_RUNNER, ExecuteBidirectionalRunnerSimpleUnary) { typedef float DType; TShape shape({5, 5}); @@ -177,11 +142,7 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalRunnerSimpleUnary) { const char *op_name = i.first.c_str(); const char *backward_op_name = i.second.c_str(); test::op::CoreOperatorRunner runner; -<<<<<<< HEAD - runner.RunBidirectional(false, shape, test::op::CoreOpExecutor::ArgsWithOpName( -======= runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor::ArgsWithOpName( ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 basic_args, op_name, backward_op_name), 1); } } @@ -193,18 +154,12 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalRunner) { const char *op_name = i.first.c_str(); const char *backward_op_name = i.second.c_str(); test::op::CoreOperatorRunner runner; -<<<<<<< HEAD - runner.RunBidirectional(false, shape, 
test::op::CoreOpExecutor::ArgsWithOpName( -======= runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor::ArgsWithOpName( ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 basic_args, op_name, backward_op_name), 1); } } /*! -<<<<<<< HEAD -======= * \brief Test RunBidirectional dot product, which has different shaped inputs and outputs */ TEST(CORE_OP_RUNNER, ExecuteBidirectionalRunnerDotProduct) { @@ -221,7 +176,6 @@ TEST(CORE_OP_RUNNER, ExecuteBidirectionalRunnerDotProduct) { } /*! ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 * \brief Timing tests for CPU */ TEST(CORE_OP_RUNNER, TimingCPUSimpleUnary) { @@ -232,13 +186,7 @@ TEST(CORE_OP_RUNNER, TimingCPUSimpleUnary) { const kwargs_t kwargs = test::op::CoreOpExecutor::ArgsWithOpName(basic_args, op_name); test::op::CoreOperatorRunner runner; -<<<<<<< HEAD - runner.RunBidirectional(false, {10, 10, 10, 10}, - kwargs, - 1); // prime code and cache -======= runner.RunBidirectional(false, { TShape({10, 10, 10, 10}) }, kwargs, 1); // prime code and cache ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 std::vector shapes; if (test::performance_run) { @@ -256,12 +204,8 @@ TEST(CORE_OP_RUNNER, TimingCPUSimpleUnary) { }; } for (const TShape &shape : shapes) { -<<<<<<< HEAD - runner.TimingTest(std::string(op_name) + "Operator CPU", false, false, kwargs, 2, 10, shape); -======= runner.TimingTest(std::string(op_name) + "Operator CPU", false, false, kwargs, 2, 10, { shape }); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 } } @@ -275,13 +219,7 @@ TEST(CORE_OP_RUNNER, TimingCPUBinary) { basic_args, op_name, backward_op_name); test::op::CoreOperatorRunner runner; -<<<<<<< HEAD - runner.RunBidirectional(false, {10, 10, 10, 10}, - kwargs, - 1); // prime code and cache -======= runner.RunBidirectional(false, { TShape({10, 10, 10, 10}) }, kwargs, 1); // prime code and cache ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 std::vector shapes; if (test::performance_run) { @@ -299,12 +237,6 @@ TEST(CORE_OP_RUNNER, 
TimingCPUBinary) { }; } for (const TShape &shape : shapes) { -<<<<<<< HEAD - runner.TimingTest(std::string(op_name) + "Operator CPU", false, false, kwargs, 2, 10, shape); - } -} - -======= runner.TimingTest(std::string(op_name) + "Operator CPU", false, false, kwargs, 2, 10, { shape }); } @@ -339,7 +271,6 @@ TEST(CORE_OP_RUNNER, TimingCPUBinaryDotProduct) { false, kwargs, 2, 10, input_shapes); } } ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 #if MXNET_USE_CUDA == 1 TEST(CORE_OP_RUNNER, TimingGPUSimpleUnary) { typedef float DType; @@ -349,12 +280,8 @@ TEST(CORE_OP_RUNNER, TimingGPUSimpleUnary) { const kwargs_t kwargs = test::op::CoreOpExecutor::ArgsWithOpName(basic_args, op_name); test::op::CoreOperatorRunner runner; -<<<<<<< HEAD - runner.RunBidirectional(false, {10, 10, 10, 10}, -======= runner.RunBidirectional(false, { TShape({10, 10, 10, 10}) }, ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 kwargs, 1); // prime code and cache @@ -374,11 +301,7 @@ TEST(CORE_OP_RUNNER, TimingGPUSimpleUnary) { }; } for (const TShape &shape : shapes) { -<<<<<<< HEAD - runner.TimingTest(std::string(op_name) + "Operator GPU", true, false, kwargs, 2, 10, shape); -======= runner.TimingTest(std::string(op_name) + "Operator GPU", true, false, kwargs, 2, 10, { shape }); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 }} TEST(CORE_OP_RUNNER, TimingGPUBinary) { @@ -391,12 +314,8 @@ TEST(CORE_OP_RUNNER, TimingGPUBinary) { basic_args, op_name, backward_op_name); test::op::CoreOperatorRunner runner; -<<<<<<< HEAD - runner.RunBidirectional(true, {10, 10, 10, 10}, -======= runner.RunBidirectional(true, { TShape({10, 10, 10, 10}) }, ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 kwargs, 1); // prime code and cache @@ -416,11 +335,7 @@ TEST(CORE_OP_RUNNER, TimingGPUBinary) { }; } for (const TShape &shape : shapes) { -<<<<<<< HEAD - runner.TimingTest(std::string(op_name) + "Operator GPU", true, false, kwargs, 2, 10, shape); -======= runner.TimingTest(std::string(op_name) + "Operator 
GPU", true, false, kwargs, 2, 10, { shape }); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 } } diff --git a/tests/cpp/operator/coreop_perf.cc b/tests/cpp/operator/coreop_perf.cc index 36e17006a533..2655740677e2 100644 --- a/tests/cpp/operator/coreop_perf.cc +++ b/tests/cpp/operator/coreop_perf.cc @@ -39,11 +39,7 @@ static void RunCoreOpBidirectional(const bool isGPU, const char *op_name, const char *backward_op_name = "") { const TShape shape({5, 5}); -<<<<<<< HEAD - test::op::CoreOpExecutor op(isGPU, shape); -======= test::op::CoreOpExecutor op(isGPU, { shape }); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 op.set_verbose(false); op.Init(op.ArgsWithOpName(op_kwargs, op_name, backward_op_name)); @@ -52,11 +48,7 @@ static void RunCoreOpBidirectional(const bool isGPU, PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs()); op.Execute(); PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs()); -<<<<<<< HEAD - if(op.HasBackward()) { -======= if (op.HasBackward()) { ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_inputs()); PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs()); op.ExecuteBackward(); @@ -64,10 +56,6 @@ static void RunCoreOpBidirectional(const bool isGPU, } } -<<<<<<< HEAD - -======= ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 template static void RunCoreOpTimingTest(const bool isGPU, const kwargs_t& op_kwargs, @@ -78,11 +66,7 @@ static void RunCoreOpTimingTest(const bool isGPU, // prime code and cache before the performance runs test::op::CoreOperatorRunner runner; -<<<<<<< HEAD - runner.RunBidirectional(false, {20, 3, 128, 128}, kwargs, 1); -======= runner.RunBidirectional(false, { {20, 3, 128, 128} }, kwargs, 1); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 // Do the performance runs std::vector shapes; @@ -102,12 +86,8 @@ static void RunCoreOpTimingTest(const bool isGPU, } const char *pu = isGPU ? 
"GPU" : "CPU"; for (const TShape &shape : shapes) { -<<<<<<< HEAD - runner.TimingTest(std::string(op_name) + " Operator " + pu, isGPU, false, kwargs, 2, 10, shape); -======= runner.TimingTest(std::string(op_name) + " Operator " + pu, isGPU, false, kwargs, 2, 10, { shape }); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 } } diff --git a/tests/cpp/operator/fully_conn_perf.cc b/tests/cpp/operator/fully_conn_perf.cc index 428622716ac1..d9a3795f46b1 100644 --- a/tests/cpp/operator/fully_conn_perf.cc +++ b/tests/cpp/operator/fully_conn_perf.cc @@ -42,11 +42,7 @@ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) { TShape shape({5, 5}); kwargs_t kwargs = basic_fullyconn_args; test::op::LegacyOpRunner runner; -<<<<<<< HEAD - runner.RunBidirectional(false, shape, kwargs, 1); -======= runner.RunBidirectional(false, { shape }, kwargs, 1); ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 } /*! @@ -55,13 +51,9 @@ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) { TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) { kwargs_t kwargs = basic_fullyconn_args; test::op::LegacyOpRunner runner; -<<<<<<< HEAD - runner.RunBidirectional(false, {10, 10, 10, 10}, kwargs, 1); // prime code and cache -======= runner.RunBidirectional(false, { TShape({10, 10, 10, 10}) }, kwargs, 1); // prime code and cache ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 std::vector shapes; if (test::performance_run) { shapes = { @@ -91,13 +83,9 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingGPU) { test::OperatorRunner> runner; -<<<<<<< HEAD - runner.RunBidirectional(true, {10, 10, 10, 10}, kwargs, 1); // prime code and cache -======= runner.RunBidirectional(true, { TShape({10, 10, 10, 10}) }, kwargs, 1); // prime code and cache ->>>>>>> 100eb88add1c5a18185226eebde0664cc313f557 std::vector shapes; if (test::performance_run) { shapes = { diff --git a/tests/cpp/test_main.cc b/tests/cpp/test_main.cc index 5434a704c090..eaf9e3c21910 100644 --- a/tests/cpp/test_main.cc +++ b/tests/cpp/test_main.cc @@ 
-38,11 +38,12 @@ static bool dumpCallback(const google_breakpad::MinidumpDescriptor& descriptor, namespace mxnet { namespace test { bool unitTestsWithCuda = false; #ifdef NDEBUG -bool debugOutput = false; +bool debug_output = false; #else -bool debugOutput = false; +bool debug_output = false; #endif bool quick_test = false; +bool performance_run = false; }} #if MXNET_USE_CUDA @@ -85,7 +86,9 @@ int main(int argc, char ** argv) { // override (ie force attempt CUDA) mxnet::test::unitTestsWithCuda = true; } else if (!strcmp(argv[x], "--debug")) { - mxnet::test::debugOutput = true; + mxnet::test::debug_output = true; + } else if (!strcmp(argv[x], "--perf")) { + mxnet::test::performance_run = true; } else if (!strcmp(argv[x], "--quick") || !strcmp(argv[x], "-q")) { mxnet::test::quick_test = true; } From 4bb3701132cb0b1d428130af59a99c87d59c2d72 Mon Sep 17 00:00:00 2001 From: Rahul Date: Wed, 8 Nov 2017 16:50:53 -0800 Subject: [PATCH 201/237] remove cpp perf quantize --- tests/cpp/operator/quantize_perf.cc | 108 ---------------------------- 1 file changed, 108 deletions(-) delete mode 100644 tests/cpp/operator/quantize_perf.cc diff --git a/tests/cpp/operator/quantize_perf.cc b/tests/cpp/operator/quantize_perf.cc deleted file mode 100644 index 689f9f5d1686..000000000000 --- a/tests/cpp/operator/quantize_perf.cc +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -#include -#include -#include "../../src/operator/activation-inl.h" -#include "../include/test_op_runner.h" -#include "../include/test_core_op.h" - -using namespace mxnet; - -using kwargs_t = test::op::kwargs_t; - -template -static void RunCoreOpBidirectional(const bool isGPU, - const kwargs_t& op_kwargs, - const char *op_name, - const char *backward_op_name = "") { - const TShape shape({5, 5}); - test::op::CoreOpExecutor op(isGPU, shape); - op.set_verbose(false); - - op.Init(op.ArgsWithOpName(op_kwargs, op_name, backward_op_name)); - - PRINT_NDARRAYS(op.ctx().run_ctx, op.inputs()); - PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs()); - op.Execute(); - PRINT_NDARRAYS(op.ctx().run_ctx, op.outputs()); - if(op.HasBackward()) { - PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_inputs()); - PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs()); - op.ExecuteBackward(); - PRINT_NDARRAYS(op.ctx().run_ctx, op.bwd_outputs()); - } -} - - -template -static void RunCoreOpTimingTest(const bool isGPU, - const kwargs_t& op_kwargs, - const char *op_name, - const char *backward_op_name = "") { - const kwargs_t kwargs = test::op::CoreOpExecutor::ArgsWithOpName( - op_kwargs, op_name, backward_op_name); - - // prime code and cache before the performance runs - test::op::CoreOperatorRunner runner; - runner.RunBidirectional(false, {20, 3, 128, 128}, kwargs, 1); - - // Do the performance runs - std::vector shapes; - if (test::performance_run) { - shapes = { - {1, 1, 28, 28}, - {1, 3, 28, 28}, - {50, 1, 18, 32}, - {50, 3, 18, 32}, - {20, 3, 128, 128} - }; - } else { - shapes = { - 
{1, 1, 28, 28}, - {50, 3, 18, 32}, - }; - } - const char *pu = isGPU ? "GPU" : "CPU"; - for (const TShape &shape : shapes) { - runner.TimingTest(std::string(op_name) + " Operator " + pu, isGPU, false, kwargs, 2, 10, shape); - } -} - -///*! -// * \brief Generic bidirectional sanity test -// */ -//TEST(OMP_TUNING, ExecuteBidirectional) { -// RunCoreOpBidirectional(false, {}, "elemwise_add", "_backward_add"); -//} - -/*! - * \brief ActivationOp timing test for CPU - */ -TEST(OMP_TUNING, TimingCPU) { - RunCoreOpTimingTest(false, {}, "quantize_2bit", COREOP_BWD_OP_NAME_VALUE_NONE); -} - -#if MXNET_USE_CUDA == 1 -/*! - * \brief ActivationOp timing test for GPU - */ -TEST(OMP_TUNING, TimingGPU) { - RunCoreOpTimingTest(true, {}, "elemwise_add", "_backward_add"); -} -#endif // MXNET_USE_CUDA == 1 From 5c0114c5465ff59269c33c29c6277366294e41ff Mon Sep 17 00:00:00 2001 From: Rahul Date: Wed, 8 Nov 2017 16:55:54 -0800 Subject: [PATCH 202/237] undo more changes --- example/image-classification/common/fit.py | 7 +- .../image-classification/train_imagenet.py | 2 + tests/python/unittest/test_operator.py | 132 +++++------------- 3 files changed, 44 insertions(+), 97 deletions(-) diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index fd5174afcaad..88d6a4379df4 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -181,10 +181,11 @@ def fit(args, network, data_loader, **kwargs): if args.network == 'alexnet': # AlexNet will not converge using Xavier - initializer = mx.init.Normal(0.5) + initializer = mx.init.Normal() else: - initializer = mx.init.Xavier(rnd_type='gaussian', - factor_type="in", magnitude=2) + initializer = mx.init.Xavier( + rnd_type='gaussian', factor_type="in", magnitude=2) + # initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), # evaluation metrices eval_metrics = ['accuracy'] diff --git a/example/image-classification/train_imagenet.py 
b/example/image-classification/train_imagenet.py index 3d18fc5cc4fc..f465fbc5f469 100644 --- a/example/image-classification/train_imagenet.py +++ b/example/image-classification/train_imagenet.py @@ -22,6 +22,7 @@ from common import find_mxnet, data, fit from common.util import download_file import mxnet as mx + if __name__ == '__main__': # parse args parser = argparse.ArgumentParser(description="train imagenet-1k", @@ -52,5 +53,6 @@ from importlib import import_module net = import_module('symbols.'+args.network) sym = net.get_symbol(**vars(args)) + # train fit.fit(args, sym, data.get_rec_iter) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index fe2475cbaf5d..93dc4a05345a 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3538,6 +3538,7 @@ def test_quantization_op(): assert same(qa.asnumpy(), qa_real.asnumpy()) assert same(a_.asnumpy(), a_real.asnumpy()) + def test_reciprocal_op(): data_tmp = np.random.rand(3, 4) * 10 - 5 # Avoid possible division by 0 errors @@ -4310,100 +4311,6 @@ def check(data, idx): assert (mx.nd.scatter_nd(data, idx, shape=(2, 2)).asnumpy() == [[0, 0], [2, 3]]).all() -def test_two_bit_quantization(): - threshold = 0.5 - orig_shape = [(16,), (25,),(1121),(144000),(1440000)] - num_repeat = 2 - from struct import pack,unpack - - def bits2int(bits): - bits = [int(x) for x in bits[::-1]] - x = 0 - for i in range(len(bits)): - x += bits[i]*2**i - return x - - def as_float32(s): - return unpack("f",pack("I", bits2int(s)))[0] - - def compute_expected(arr, curr_residual, threshold): - # str_quant stores the quantized representation as a sequence of bits - str_quant = '' - new_residual = [] - decompr = [] - arr_npy = arr.asnumpy() - curr_res_npy = curr_residual.asnumpy() - for i, a in np.ndenumerate(arr_npy): - a += curr_res_npy[i] - if a >= threshold: - str_quant += '11' - new_residual.append(a - threshold) - decompr.append(threshold) - elif a <= 
(-1*threshold): - str_quant += '10' - new_residual.append(a + threshold) - decompr.append(-1*threshold) - else: - str_quant += '00' - new_residual.append(a) - decompr.append(0) - # append extra bits when size of array not a factor of 16 - if len(str_quant)%16 != 0: - str_quant += '0'*(16 - len(str_quant)%16) - - compr = [] - # converts the string generated into integers 32chars at a time - i = 0 - while i Date: Wed, 8 Nov 2017 17:25:28 -0800 Subject: [PATCH 203/237] add inactive function so that multiple kvstore dist inits have no compression fix tests --- include/mxnet/gc.h | 5 ++ src/kvstore/gc.cc | 4 ++ src/kvstore/kvstore_dist.h | 8 +++ src/kvstore/kvstore_local.h | 1 - tests/nightly/dist_sync_kvstore.py | 85 +++++++++++++++++++++++------- tests/nightly/test_kvstore.py | 2 +- 6 files changed, 85 insertions(+), 20 deletions(-) diff --git a/include/mxnet/gc.h b/include/mxnet/gc.h index bced8a91d2a2..d9cdd3f11946 100644 --- a/include/mxnet/gc.h +++ b/include/mxnet/gc.h @@ -55,6 +55,11 @@ class Gc { */ void set_active(); + /*! + * \brief sets gradient compression to inactive mode + */ + void set_inactive(); + /*! 
* \brief returns boolean whether or not gc is in active mode */ diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc index a64ce4ab7258..1160bca7ed77 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gc.cc @@ -62,6 +62,10 @@ void Gc::set_active() { active_ = true; } +void Gc::set_inactive() { + active_ = false; +} + bool Gc::get_active() { return active_; } diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 757a5756f701..295827e4e2af 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -177,6 +177,11 @@ class KVStoreDist : public KVStoreLocal { comm_->Init(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } if (get_rank() == 0) { + // set inactive for inits + if (gc_->get_active()) { + gc_->set_inactive(); + } + Push_(keys, values, 0, false); // wait until the push is finished for (const int key : keys) { @@ -301,7 +306,10 @@ class KVStoreDist : public KVStoreLocal { std::vector uniq_keys; std::vector > grouped_vals; GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals); + + // set active for non init pushes if (do_merge && !gc_->get_active()) gc_->set_active(); + for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devices int key = uniq_keys[i]; diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 756a6f243e8b..854b3600c73c 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -150,7 +150,6 @@ class KVStoreLocal : public KVStore { local_[keys[i]] = values[i].Copy(pinned_ctx_); comm_->Init(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } - //TODO verify if comm destruction doesn't cause double free memory corruption comm_->SetGradientCompression(gc_); gc_->set_active(); } diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 18c9e1b33fca..5e5fbe3b63e9 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -183,6 +183,51 @@ def 
check_big_row_sparse_keys(kv, my_rank, nworker): expected[row] = updated_val[row] check_diff_to_scalar(val, expected, rank=my_rank) + def compute_expected(arr, curr_residual, threshold): + from struct import pack,unpack + def bits2int(bits): + bits = [int(x) for x in bits[::-1]] + x = 0 + for i in range(len(bits)): + x += bits[i]*2**i + return x + + def as_float32(s): + return unpack("f",pack("I", bits2int(s)))[0] + + # str_quant stores the quantized representation as a sequence of bits + str_quant = '' + new_residual = [] + decompr = [] + arr_npy = arr.asnumpy() + curr_res_npy = curr_residual.asnumpy() + for i, a in np.ndenumerate(arr_npy): + a += curr_res_npy[i] + if a >= threshold: + str_quant += '11' + new_residual.append(a - threshold) + decompr.append(threshold) + elif a <= (-1*threshold): + str_quant += '10' + new_residual.append(a + threshold) + decompr.append(-1*threshold) + else: + str_quant += '00' + new_residual.append(a) + decompr.append(0) + # append extra bits when size of array not a factor of 16 + if len(str_quant)%16 != 0: + str_quant += '0'*(16 - len(str_quant)%16) + + compr = [] + # converts the string generated into integers 32chars at a time + i = 0 + while i Date: Wed, 8 Nov 2017 17:32:36 -0800 Subject: [PATCH 204/237] undo some formatting changes --- src/kvstore/comm.h | 1 - src/kvstore/kvstore_dist.h | 8 -------- src/kvstore/kvstore_dist_server.h | 13 ++++++------- src/kvstore/kvstore_local.h | 1 - tests/nightly/dist_sync_kvstore.py | 2 +- tools/bandwidth/measure.py | 3 --- 6 files changed, 7 insertions(+), 21 deletions(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index dbce922baa1d..5dace746b561 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -706,7 +706,6 @@ class CommDevice : public Comm { std::vector small_recv_buf; }; std::unordered_map merge_buf_; - bool inited_; }; diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 295827e4e2af..6c3794fda14c 100644 --- a/src/kvstore/kvstore_dist.h +++ 
b/src/kvstore/kvstore_dist.h @@ -31,7 +31,6 @@ #include "mxnet/engine.h" #include "ps/ps.h" #include "./kvstore_dist_server.h" -#include "../ndarray/ndarray_function.h" #if MKL_EXPERIMENTAL == 1 #include #include "../operator/mkl/mkl_memory-inl.h" @@ -701,39 +700,32 @@ class KVStoreDist : public KVStoreLocal { * \brief for worker to push and pull data */ ps::KVWorker* ps_worker_; - /** * \brief the server handle */ KVStoreDistServer* server_; - /** * \brief threshold for partition */ size_t bigarray_bound_; - /** * \brief buffer for non-compressed data. * When gradient compression is active, this is used * for the data in pull and for original data in push */ std::unordered_map comm_buf_; - /** * \brief buffer for compressed data * Used when gradient compression is active and action * is push */ std::unordered_map compr_buf_; - /** * \brief residual buffer to accumulate quantization error * during gradient compression */ std::unordered_map residual_; - bool log_verbose_; - }; } // namespace kvstore diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 908109798fd7..42d15ba1ae2e 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -35,7 +35,6 @@ #include "mxnet/kvstore.h" #include "../operator/tensor/elemwise_binary_op-inl.h" #include "../operator/tensor/init_op.h" -#include "../ndarray/ndarray_function.h" namespace mxnet { namespace kvstore { @@ -475,7 +474,7 @@ class KVStoreDistServer { if (stored.is_none()) { // initialization stored = NDArray(dshape, Context()); - CopyFromTo(recved, &stored, 0); + CopyFromTo(recved, &stored, 0); server->Response(req_meta); stored.WaitToRead(); } else if (sync_mode_) { @@ -485,17 +484,17 @@ class KVStoreDistServer { merged.array = NDArray(dshape, Context()); } if (merged.request.size() == 0) { - CopyFromTo(recved, &merged.array, 0); + CopyFromTo(recved, &merged.array, 0); } else { - merged.array += recved; + merged.array += recved; } 
merged.request.push_back(req_meta); ApplyUpdates(key, &merged, &stored, server); } else { // async push - exec_.Exec([this, key, &recved, &stored]() { - CHECK(updater_); - updater_(key, recved, &stored); + exec_.Exec([this, key, &recved, &stored]() { + CHECK(updater_); + updater_(key, recved, &stored); }); server->Response(req_meta); stored.WaitToRead(); diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 854b3600c73c..2d3d55eb6030 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -387,7 +387,6 @@ class KVStoreLocal : public KVStore { std::unordered_set warnings_printed_; /// whether int or string is used for keys KeyType key_type_ = kUndefinedKey; - }; } // namespace kvstore } // namespace mxnet diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 5e5fbe3b63e9..cbb1c3a51806 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -346,5 +346,5 @@ def check_init(kv, cur_keys, cur_shape, device=False): print('worker ' + str(my_rank) + ' is initialized') if __name__ == "__main__": - # test_sync_init() + test_sync_init() test_sync_push_pull() diff --git a/tools/bandwidth/measure.py b/tools/bandwidth/measure.py index 472f954b14b8..cc8379dfc0b6 100644 --- a/tools/bandwidth/measure.py +++ b/tools/bandwidth/measure.py @@ -92,9 +92,6 @@ def run(network, optimizer, gpus, kv_store, image_shape, disp_batches, data_shape = (32,) + tuple([int(s) for s in image_shape.split(',')]) shapes = get_shapes(symbol, data_shape) - - for s in shapes: - print(s) size = float(sum([reduce(lambda x,y : x*y, s, 1) for s in shapes])) * 4 / 1e6 logging.info('num of arrays = %d, total size = %f MB' % (len(shapes), size)) From 80957a7f2ea59b8ffb3a8ece341b107f5fe51506 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 9 Nov 2017 15:32:50 -0800 Subject: [PATCH 205/237] make sharding same when inactive and active --- include/mxnet/gc.h | 6 +++++- src/kvstore/gc.cc | 26 
++++++++++++++++++++++++++ src/kvstore/kvstore_dist.h | 29 +++++++++++++++++------------ 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/include/mxnet/gc.h b/include/mxnet/gc.h index d9cdd3f11946..959a4803479d 100644 --- a/include/mxnet/gc.h +++ b/include/mxnet/gc.h @@ -65,12 +65,16 @@ class Gc { */ bool get_active(); + CompressionType get_type(); + /*! * \brief if gc is in active mode, returns type of compression set * else returns GC_NONE */ bool get_active_type(); + void increment_push(int key); + void SetTwoBitCompression(const float threshold); /*! @@ -134,7 +138,7 @@ class Gc { * all negative gradients will be thresholded to -1*`threshold_` */ float threshold_ = 0; - + std::unordered_map num_pushes_; }; } // namespace kvstore } // namespace mxnet diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc index 1160bca7ed77..e573a196aaf0 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gc.cc @@ -25,6 +25,8 @@ #include #include #include +// for get_rank +#include #include "./gc-inl.h" namespace mxnet { @@ -70,11 +72,34 @@ bool Gc::get_active() { return active_; } +CompressionType Gc::get_type() { + return type_; +} + bool Gc::get_active_type() { if (active_) return type_; else return GC_NONE; } +void Gc::increment_push(int key) { +// if (!get_active()) { + std::unordered_map::const_iterator got = num_pushes_.find(key); + if (got == num_pushes_.end()) { + // first push is init, so not counting that + num_pushes_[key] = 0; + } else { + num_pushes_[key] += 1; + } +// if(ps::MyRank()==0) std::cout<<"numpush for key "< 0) { +// if(ps::MyRank()==0) std::cout<<"would set active "<shape().Size()<ctx().dev_mask(); diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 6c3794fda14c..e6c68d54bf00 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -312,6 +312,7 @@ class KVStoreDist : public KVStoreLocal { for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devices int key = uniq_keys[i]; + 
gc_->increment_push(key); const auto& vals = grouped_vals[i]; NDArray merged = do_merge ? comm_->Reduce(key, vals, priority) : vals[0]; @@ -336,14 +337,22 @@ class KVStoreDist : public KVStoreLocal { // push to servers if (storage_type == kDefaultStorage) { - if (gc_->get_active_type() == GC_NONE) { - std::cout<<"gc is none for push of key"<get_type() == GC_NONE) { + PSKV& pskv = EncodeDefaultKey(key, comm_buf.shape().Size(), true); + PushDefault(key, comm_buf, pskv, priority); } else { - PushCompressed(key, comm_buf, priority); + // returns push_pskv if active, else pull_pskv + // we want inactive gc to send uncompressed gradients, but sharded same as active gc + // but calculates both push and pull pskv + PSKV &pskv = EncodeCompressedKey(key, comm_buf.shape().Size(), gc_->get_active()); + if (gc_->get_active()) { + PushCompressed(key, comm_buf, pskv, priority); + } else { + PushDefault(key, comm_buf, pskv, priority); + } } } else if (storage_type == kRowSparseStorage) { - if (gc_->get_active_type() != GC_NONE) { + if (gc_->get_type() != GC_NONE) { LOG(FATAL) << "Gradient compression for row sparse storage type is not supported"; } PushRowSparse(key, comm_buf, priority); @@ -353,14 +362,11 @@ class KVStoreDist : public KVStoreLocal { } } - void PushCompressed(int key, const NDArray& comm_buf, int priority) { + void PushCompressed(int key, const NDArray& comm_buf, PSKV& pskv, int priority) { auto &small_buf = compr_buf_[key]; auto &res_buf = residual_[key]; size_t original_size = comm_buf.shape().Size(); - // returns push_pskv but calculates both push and pull pskv - PSKV &pskv = EncodeCompressedKey(key, original_size, true); - // Init the small buffer and residual_ buffer for quantize if (small_buf.is_none()) { small_buf = NDArray(TShape{pskv.size}, comm_buf.ctx(), false, comm_buf.dtype()); @@ -393,16 +399,15 @@ class KVStoreDist : public KVStoreLocal { PROFILER_MESSAGE("KVStoreDistCompressedPush")); } - void PushDefault(int key, const NDArray &send_buf, int 
priority) { + void PushDefault(int key, const NDArray &send_buf, const PSKV& pskv, int priority) { auto push_to_servers = - [this, key, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { + [this, key, pskv, send_buf](RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys size_t size = send_buf.shape().Size(); real_t* data = send_buf.data().dptr(); #if MKL_EXPERIMENTAL == 1 mkl_set_tblob_eager_mode(send_buf.data()); #endif - PSKV& pskv = EncodeDefaultKey(key, size, true); // do push. false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( From 222f33c0b7b4f9b4bcb42b1a0fb99a9a361577ec Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 9 Nov 2017 15:52:44 -0800 Subject: [PATCH 206/237] remove counts and get_active_type --- include/mxnet/gc.h | 21 +++++---------------- src/kvstore/comm.h | 2 +- src/kvstore/gc.cc | 35 ++++------------------------------- src/kvstore/kvstore_dist.h | 15 ++++++--------- src/kvstore/kvstore_local.h | 2 +- 5 files changed, 17 insertions(+), 58 deletions(-) diff --git a/include/mxnet/gc.h b/include/mxnet/gc.h index 959a4803479d..e95613843aee 100644 --- a/include/mxnet/gc.h +++ b/include/mxnet/gc.h @@ -49,31 +49,21 @@ class Gc { void SetParams(const std::string &compression_type, const float threshold); /*! - * \brief sets gradient compression to active mode + * \brief sets gradient compression to given mode * Active mode is when gradients are compressed * Gc is in inactive mode during init of parameters */ - void set_active(); - - /*! - * \brief sets gradient compression to inactive mode - */ - void set_inactive(); + void set_active(bool active); /*! * \brief returns boolean whether or not gc is in active mode */ - bool get_active(); - - CompressionType get_type(); + bool is_active(); /*! 
- * \brief if gc is in active mode, returns type of compression set - * else returns GC_NONE + * \brief returns type of compression if any */ - bool get_active_type(); - - void increment_push(int key); + CompressionType get_type(); void SetTwoBitCompression(const float threshold); @@ -138,7 +128,6 @@ class Gc { * all negative gradients will be thresholded to -1*`threshold_` */ float threshold_ = 0; - std::unordered_map num_pushes_; }; } // namespace kvstore } // namespace mxnet diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 5dace746b561..b37c89fb33fb 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -491,7 +491,7 @@ class CommDevice : public Comm { const NDArray& Reduce(int key, const std::vector& src, int priority) override { - if (gc_->get_active_type() != GC_NONE) { + if (gc_->is_active()) { return ReduceCompressed(key, src, priority); } diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc index e573a196aaf0..2bdf27a15e2e 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gc.cc @@ -60,15 +60,12 @@ void Gc::SetParams(const std::string &compression_type, const float threshold) { } } -void Gc::set_active() { - active_ = true; +void Gc::set_active(bool active) { + active_ = active; } -void Gc::set_inactive() { - active_ = false; -} - -bool Gc::get_active() { +// can only be active when type is not GC_NONE +bool Gc::is_active() { return active_; } @@ -76,30 +73,6 @@ CompressionType Gc::get_type() { return type_; } -bool Gc::get_active_type() { - if (active_) return type_; - else return GC_NONE; -} - -void Gc::increment_push(int key) { -// if (!get_active()) { - std::unordered_map::const_iterator got = num_pushes_.find(key); - if (got == num_pushes_.end()) { - // first push is init, so not counting that - num_pushes_[key] = 0; - } else { - num_pushes_[key] += 1; - } -// if(ps::MyRank()==0) std::cout<<"numpush for key "< 0) { -// if(ps::MyRank()==0) std::cout<<"would set active "<get_active()) { - gc_->set_inactive(); - } + if (gc_->is_active()) 
gc_->set_active(false); Push_(keys, values, 0, false); // wait until the push is finished @@ -225,7 +223,7 @@ class KVStoreDist : public KVStoreLocal { RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys size_t size = recv_buf.shape().Size(); - PSKV& pskv = (gc_->get_active_type() == GC_NONE) ? + PSKV& pskv = (gc_->get_type() == GC_NONE) ? EncodeDefaultKey(key, size, false) : EncodeCompressedKey(key, size, false); #if MKL_EXPERIMENTAL == 1 @@ -235,7 +233,7 @@ class KVStoreDist : public KVStoreLocal { // false means not to delete data when SArray is deleted auto vals = new ps::SArray(data, size, false); // issue pull - int cmd = (gc_->get_active_type() == GC_NONE) ? kDefaultPushPull : kCompressedPushPull; + int cmd = (gc_->get_type() != GC_NONE) ? kCompressedPushPull : kDefaultPushPull; CHECK_NOTNULL(ps_worker_)->ZPull( pskv.keys, vals, &pskv.lens, cmd, [vals, cb](){ delete vals; cb(); }); }; @@ -307,12 +305,11 @@ class KVStoreDist : public KVStoreLocal { GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals); // set active for non init pushes - if (do_merge && !gc_->get_active()) gc_->set_active(); + if (do_merge && !gc_->is_active()) gc_->set_active(true); for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devices int key = uniq_keys[i]; - gc_->increment_push(key); const auto& vals = grouped_vals[i]; NDArray merged = do_merge ? 
comm_->Reduce(key, vals, priority) : vals[0]; @@ -344,8 +341,8 @@ class KVStoreDist : public KVStoreLocal { // returns push_pskv if active, else pull_pskv // we want inactive gc to send uncompressed gradients, but sharded same as active gc // but calculates both push and pull pskv - PSKV &pskv = EncodeCompressedKey(key, comm_buf.shape().Size(), gc_->get_active()); - if (gc_->get_active()) { + PSKV &pskv = EncodeCompressedKey(key, comm_buf.shape().Size(), gc_->is_active()); + if (gc_->is_active()) { PushCompressed(key, comm_buf, pskv, priority); } else { PushDefault(key, comm_buf, pskv, priority); diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 2d3d55eb6030..e89c2db51f22 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -151,7 +151,7 @@ class KVStoreLocal : public KVStore { comm_->Init(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } comm_->SetGradientCompression(gc_); - gc_->set_active(); + gc_->set_active(true); } virtual void PushImpl(const std::vector& keys, From dc3b8e6ee74e40c23d8f1686be21fece245f2680 Mon Sep 17 00:00:00 2001 From: Rahul Date: Thu, 9 Nov 2017 16:04:39 -0800 Subject: [PATCH 207/237] remove print --- src/kvstore/gc.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc index 2bdf27a15e2e..9e05cdeb76a2 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gc.cc @@ -115,7 +115,6 @@ int64_t Gc::GetCompressedSize(const int64_t original_size) { void Gc::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, mxnet::NDArray *residual, const int priority) { - if(ps::MyRank()==0) std::cout<shape().Size()<ctx().dev_mask(); From ac55cdc7caf1247fa06b547072e05b1c3548c1a0 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 10 Nov 2017 13:50:29 -0800 Subject: [PATCH 208/237] add train caltech --- example/image-classification/train_caltech.py | 59 +++++++++++++++++++ src/kvstore/comm.h | 2 +- src/kvstore/gc.cc | 5 +- 3 files changed, 62 
insertions(+), 4 deletions(-) create mode 100644 example/image-classification/train_caltech.py diff --git a/example/image-classification/train_caltech.py b/example/image-classification/train_caltech.py new file mode 100644 index 000000000000..53e5702b3d13 --- /dev/null +++ b/example/image-classification/train_caltech.py @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import os +import argparse +import logging +logging.basicConfig(level=logging.DEBUG) +from common import find_mxnet, data, fit + +if __name__ == '__main__': + train_fname = os.path.join("data", "caltech-256-60-val.rec") + val_fname = os.path.join("data", "caltech-256-60-train.rec") + # parse args + parser = argparse.ArgumentParser(description="train caltech256", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + fit.add_fit_args(parser) + data.add_data_args(parser) + data.add_data_aug_args(parser) + data.set_data_aug_level(parser, 2) + parser.set_defaults( + # network + network = 'mlp', + num_layers = 110, + # data + data_train = train_fname, + data_val = val_fname, + num_classes = 256, + num_examples = 15420, + image_shape = '3,256,256', + pad_size = 4, + # train + batch_size = 128, + num_epochs = 300, + lr = .05, + lr_step_epochs = '200,250', + ) + args = parser.parse_args() + + # load network + from importlib import import_module + net = import_module('symbols.'+args.network) + sym = net.get_symbol(**vars(args)) + + # train + fit.fit(args, sym, data.get_rec_iter) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index b37c89fb33fb..04188dec203b 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -491,7 +491,7 @@ class CommDevice : public Comm { const NDArray& Reduce(int key, const std::vector& src, int priority) override { - if (gc_->is_active()) { + if (gc_->get_type() != GC_NONE && gc_->is_active()) { return ReduceCompressed(key, src, priority); } diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc index 9e05cdeb76a2..e152e3c4cc8a 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gc.cc @@ -63,8 +63,7 @@ void Gc::SetParams(const std::string &compression_type, const float threshold) { void Gc::set_active(bool active) { active_ = active; } - -// can only be active when type is not GC_NONE +// can be active when type is none bool Gc::is_active() { return active_; } @@ -101,7 +100,7 @@ int Gc::GetCompressionFactor() { if (type_ == GC_TWO_BIT) { 
return 16; } else { - LOG(FATAL) << "Unsupported compression type"; + LOG(FATAL) << "Unsupported compression type: " << type_; return 0; } } From 48d54dfe58786f1e7a8c609eacfba82f1839ca90 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 10 Nov 2017 15:06:14 -0800 Subject: [PATCH 209/237] increase size of mlp --- dmlc-core | 2 +- example/image-classification/symbols/mlp.py | 12 +++++++----- example/speech_recognition/main.py | 1 - 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/dmlc-core b/dmlc-core index 595d02c0e87b..be05e33f16ef 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 595d02c0e87be8a0846700462b6f45f1b1031e39 +Subproject commit be05e33f16ef5e0be38a410da30c761158263c8f diff --git a/example/image-classification/symbols/mlp.py b/example/image-classification/symbols/mlp.py index 0aaa38c44a32..f2330c8fdef3 100644 --- a/example/image-classification/symbols/mlp.py +++ b/example/image-classification/symbols/mlp.py @@ -23,12 +23,14 @@ def get_symbol(num_classes=10, **kwargs): data = mx.symbol.Variable('data') data = mx.sym.Flatten(data=data) - fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=1500) + fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=1536) act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 1500) + fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 1536) act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") - fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=1500) + fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=1536) act3 = mx.symbol.Activation(data = fc3, name='relu3', act_type="relu") - fc4 = mx.symbol.FullyConnected(data = act3, name='fc4', num_hidden=3000) - mlp = mx.symbol.SoftmaxOutput(data = fc4, name = 'softmax') + fc4 = mx.symbol.FullyConnected(data = act3, name='fc4', num_hidden=1536) + act4 = 
mx.symbol.Activation(data = fc4, name='relu4', act_type="relu") + fc5 = mx.symbol.FullyConnected(data = act4, name='fc5', num_hidden=3101) + mlp = mx.symbol.SoftmaxOutput(data = fc5, name = 'softmax') return mlp diff --git a/example/speech_recognition/main.py b/example/speech_recognition/main.py index e45026343de7..f041af4a95d2 100644 --- a/example/speech_recognition/main.py +++ b/example/speech_recognition/main.py @@ -286,7 +286,6 @@ def load_model(args, contexts, data_train): 'train or predict or load can be the candidate for the mode.') # get meta file where character to number conversions are defined - contexts = parse_contexts(args) num_gpu = len(contexts) batch_size = args.config.getint('common', 'batch_size') From eea86ff284175e60520bea0117174bf13a97845c Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Sat, 11 Nov 2017 00:59:28 +0000 Subject: [PATCH 210/237] update to alexa mlp --- example/image-classification/symbols/mlp.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/example/image-classification/symbols/mlp.py b/example/image-classification/symbols/mlp.py index 0aaa38c44a32..f2330c8fdef3 100644 --- a/example/image-classification/symbols/mlp.py +++ b/example/image-classification/symbols/mlp.py @@ -23,12 +23,14 @@ def get_symbol(num_classes=10, **kwargs): data = mx.symbol.Variable('data') data = mx.sym.Flatten(data=data) - fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=1500) + fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=1536) act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 1500) + fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 1536) act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") - fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=1500) + fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=1536) act3 = 
mx.symbol.Activation(data = fc3, name='relu3', act_type="relu") - fc4 = mx.symbol.FullyConnected(data = act3, name='fc4', num_hidden=3000) - mlp = mx.symbol.SoftmaxOutput(data = fc4, name = 'softmax') + fc4 = mx.symbol.FullyConnected(data = act3, name='fc4', num_hidden=1536) + act4 = mx.symbol.Activation(data = fc4, name='relu4', act_type="relu") + fc5 = mx.symbol.FullyConnected(data = act4, name='fc5', num_hidden=3101) + mlp = mx.symbol.SoftmaxOutput(data = fc5, name = 'softmax') return mlp From aa6fb6fb516782bc108d9373b362ad7a46b85129 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Sat, 11 Nov 2017 01:24:44 +0000 Subject: [PATCH 211/237] pass-env changes --- dmlc-core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dmlc-core b/dmlc-core index be05e33f16ef..0c57ebabecce 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit be05e33f16ef5e0be38a410da30c761158263c8f +Subproject commit 0c57ebabecce093d15d2cb2c061e6ec1165d6937 From b694f159f77ac91ef2dd608e495174a5f2f6b8dc Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Sat, 11 Nov 2017 23:53:09 -0800 Subject: [PATCH 212/237] add bucketing module compression --- example/image-classification/train_imagenet.py | 2 ++ example/rnn/lstm_bucketing.py | 5 ++++- python/mxnet/module/bucketing_module.py | 10 +++++++--- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/example/image-classification/train_imagenet.py b/example/image-classification/train_imagenet.py index f465fbc5f469..dd45bf3f8ef3 100644 --- a/example/image-classification/train_imagenet.py +++ b/example/image-classification/train_imagenet.py @@ -33,6 +33,8 @@ # use a large aug level data.set_data_aug_level(parser, 3) parser.set_defaults( + data_train = 'data/imagenet1k-train-t3.rec', + data_train_idx = 'data/imagenet1k-train-t3.rec', # network network = 'resnet', num_layers = 50, diff --git a/example/rnn/lstm_bucketing.py b/example/rnn/lstm_bucketing.py index 2e7bc65d437a..93c7821a7264 100644 --- 
a/example/rnn/lstm_bucketing.py +++ b/example/rnn/lstm_bucketing.py @@ -47,6 +47,8 @@ help='the batch size.') parser.add_argument('--disp-batches', type=int, default=50, help='show progress for every n batches') +parser.add_argument('--gc-type', type=str, default='none', + help='type of gradient compression') def tokenize_text(fname, vocab=None, invalid_label=-1, start_label=0): @@ -111,7 +113,8 @@ def sym_gen(seq_len): model = mx.mod.BucketingModule( sym_gen = sym_gen, default_bucket_key = data_train.default_bucket_key, - context = contexts) + context = contexts, + compression_params = {'compression':args.gc_type}) model.fit( train_data = data_train, diff --git a/python/mxnet/module/bucketing_module.py b/python/mxnet/module/bucketing_module.py index f3c7ecbddc05..1396eb575262 100644 --- a/python/mxnet/module/bucketing_module.py +++ b/python/mxnet/module/bucketing_module.py @@ -55,7 +55,8 @@ class BucketingModule(BaseModule): """ def __init__(self, sym_gen, default_bucket_key=None, logger=logging, context=ctx.cpu(), work_load_list=None, - fixed_param_names=None, state_names=None): + fixed_param_names=None, state_names=None, + compression_params=None): super(BucketingModule, self).__init__(logger=logger) assert default_bucket_key is not None @@ -73,6 +74,7 @@ def __init__(self, sym_gen, default_bucket_key=None, logger=logging, _check_input_names(symbol, state_names, "state", True) _check_input_names(symbol, fixed_param_names, "fixed_param", True) + self._compression_params = compression_params self._fixed_param_names = fixed_param_names self._state_names = state_names self._context = context @@ -319,7 +321,8 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, module = Module(symbol, data_names, label_names, logger=self.logger, context=self._context, work_load_list=self._work_load_list, fixed_param_names=self._fixed_param_names, - state_names=self._state_names) + state_names=self._state_names, + compression_params=self._compression_params) 
module.bind(data_shapes, label_shapes, for_training, inputs_need_grad, force_rebind=False, shared_module=None, grad_req=grad_req) self._curr_module = module @@ -349,7 +352,8 @@ def switch_bucket(self, bucket_key, data_shapes, label_shapes=None): logger=self.logger, context=self._context, work_load_list=self._work_load_list, fixed_param_names=self._fixed_param_names, - state_names=self._state_names) + state_names=self._state_names, + compression_params=self._compression_params) module.bind(data_shapes, label_shapes, self._curr_module.for_training, self._curr_module.inputs_need_grad, force_rebind=False, shared_module=self._buckets[self._default_bucket_key]) From b84f1797c56abf1d075a1dd4eb9f1c041bd7343e Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 13 Nov 2017 16:31:26 -0800 Subject: [PATCH 213/237] attempts for alexnet training --- dmlc-core | 2 +- example/image-classification/common/fit.py | 7 +- .../image-classification/symbols/alexnet.py | 6 +- .../image-classification/train_imagenet_gc.py | 65 +++++++++++++++++++ 4 files changed, 74 insertions(+), 6 deletions(-) create mode 100644 example/image-classification/train_imagenet_gc.py diff --git a/dmlc-core b/dmlc-core index 0c57ebabecce..be05e33f16ef 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 0c57ebabecce093d15d2cb2c061e6ec1165d6937 +Subproject commit be05e33f16ef5e0be38a410da30c761158263c8f diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 88d6a4379df4..65e53f0f3ce7 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -181,14 +181,17 @@ def fit(args, network, data_loader, **kwargs): if args.network == 'alexnet': # AlexNet will not converge using Xavier - initializer = mx.init.Normal() + initializer = mx.init.Mixed(['conv2_bias', 'conv4_bias','conv5_bias', + 'conv1_bias', 'conv3_bias','.*'], + [mx.init.Constant(0.1), mx.init.Constant(0.1), mx.init.Constant(0.1), + 
mx.init.Zero(), mx.init.Zero(), mx.init.Normal()]) else: initializer = mx.init.Xavier( rnd_type='gaussian', factor_type="in", magnitude=2) # initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), # evaluation metrices - eval_metrics = ['accuracy'] + eval_metrics = ['accuracy','ce'] if args.top_k > 0: eval_metrics.append(mx.metric.create('top_k_accuracy', top_k=args.top_k)) diff --git a/example/image-classification/symbols/alexnet.py b/example/image-classification/symbols/alexnet.py index f945b9f87cd9..3b6066c6d92a 100755 --- a/example/image-classification/symbols/alexnet.py +++ b/example/image-classification/symbols/alexnet.py @@ -31,14 +31,14 @@ def get_symbol(num_classes, dtype='float32', **kwargs): conv1 = mx.sym.Convolution(name='conv1', data=input_data, kernel=(11, 11), stride=(4, 4), num_filter=96) relu1 = mx.sym.Activation(data=conv1, act_type="relu") - lrn1 = mx.sym.LRN(data=relu1, alpha=0.0001, beta=0.75, knorm=2, nsize=5) + lrn1 = mx.sym.LRN(data=relu1, alpha=0.0001, beta=0.75, knorm=1, nsize=4) pool1 = mx.sym.Pooling( data=lrn1, pool_type="max", kernel=(3, 3), stride=(2,2)) # stage 2 conv2 = mx.sym.Convolution(name='conv2', - data=pool1, kernel=(5, 5), pad=(2, 2), num_filter=256) + data=pool1, kernel=(5, 5), pad=(2, 2), num_filter=192) relu2 = mx.sym.Activation(data=conv2, act_type="relu") - lrn2 = mx.sym.LRN(data=relu2, alpha=0.0001, beta=0.75, knorm=2, nsize=5) + lrn2 = mx.sym.LRN(data=relu2, alpha=0.0001, beta=0.75, knorm=1, nsize=4) pool2 = mx.sym.Pooling(data=lrn2, kernel=(3, 3), stride=(2, 2), pool_type="max") # stage 3 conv3 = mx.sym.Convolution(name='conv3', diff --git a/example/image-classification/train_imagenet_gc.py b/example/image-classification/train_imagenet_gc.py new file mode 100644 index 000000000000..eaa4811c205c --- /dev/null +++ b/example/image-classification/train_imagenet_gc.py @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import argparse +import logging +logging.basicConfig(level=logging.DEBUG) +from common import find_mxnet, data, fit +from common.util import download_file +import mxnet as mx + +if __name__ == '__main__': + # parse args + parser = argparse.ArgumentParser(description="train imagenet gc", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + fit.add_fit_args(parser) + data.add_data_args(parser) + data.add_data_aug_args(parser) + # use a large aug level + data.set_data_aug_level(parser, 3) + parser.set_defaults( + data_train = 'data/imagenet2012-train.rec', + data_train_idx = 'data/imagenet2012-train.idx', + data_val = 'data/imagenet1k-val.rec', + data_val_idx = 'data/imagenet1k-val.idx', + + # network + network = 'resnet', + num_layers = 50, + + # data + num_classes = 1000, + num_examples = 1281167, + image_shape = '3,224,224', + min_random_scale = 1, # if input image has min size k, suggest to use + # 256.0/x, e.g. 
0.533 for 480 + # train + num_epochs = 80, + lr_step_epochs = '30,60', + dtype = 'float32' + ) + args = parser.parse_args() + + # load network + from importlib import import_module + net = import_module('symbols.'+args.network) + sym = net.get_symbol(**vars(args)) + + # train + fit.fit(args, sym, data.get_rec_iter) + From 257888375cb406e3322d17832b6837782fa17644 Mon Sep 17 00:00:00 2001 From: Rahul Date: Tue, 14 Nov 2017 23:27:04 -0800 Subject: [PATCH 214/237] prepare for merge --- dmlc-core | 2 +- example/image-classification/common/fit.py | 48 +++++++------- .../image-classification/symbols/alexnet.py | 6 +- example/image-classification/symbols/mlp.py | 12 ++-- .../image-classification/train_imagenet.py | 2 - .../image-classification/train_imagenet_gc.py | 65 ------------------- example/rnn/lstm_bucketing.py | 5 +- example/speech_recognition/main.py | 1 + include/mxnet/gc.h | 6 +- include/mxnet/kvstore.h | 8 +-- python/mxnet/kvstore.py | 7 +- python/mxnet/model.py | 1 + python/mxnet/module/bucketing_module.py | 5 ++ python/mxnet/module/module.py | 1 - src/c_api/c_api.cc | 4 +- src/kvstore/comm.h | 9 ++- src/kvstore/gc-inl.h | 14 ++-- src/kvstore/gc.cc | 17 +++-- src/kvstore/kvstore_dist.h | 10 +-- src/kvstore/kvstore_dist_server.h | 6 +- tools/launch.py | 6 +- 21 files changed, 89 insertions(+), 146 deletions(-) delete mode 100644 example/image-classification/train_imagenet_gc.py diff --git a/dmlc-core b/dmlc-core index be05e33f16ef..595d02c0e87b 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit be05e33f16ef5e0be38a410da30c761158263c8f +Subproject commit 595d02c0e87be8a0846700462b6f45f1b1031e39 diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 65e53f0f3ce7..c60772dd495b 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -119,7 +119,10 @@ def fit(args, network, data_loader, **kwargs): """ # kvstore kv = 
mx.kvstore.create(args.kv_store) - kv.set_gradient_compression({'compression':args.gc_type, 'threshold':args.gc_threshold}) + if args.gc_type != 'none': + kv.set_gradient_compression({'compression': args.gc_type, + 'threshold': args.gc_threshold}) + # logging head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' logging.basicConfig(level=logging.DEBUG, format=head) @@ -167,10 +170,10 @@ def fit(args, network, data_loader, **kwargs): lr_scheduler = lr_scheduler optimizer_params = { - 'learning_rate': lr, - 'wd' : args.wd, - 'lr_scheduler': lr_scheduler, - 'multi_precision': True} + 'learning_rate': lr, + 'wd' : args.wd, + 'lr_scheduler': lr_scheduler, + 'multi_precision': True} # Only a limited number of optimizers have 'momentum' property has_momentum = {'sgd', 'dcasgd', 'nag'} @@ -181,17 +184,14 @@ def fit(args, network, data_loader, **kwargs): if args.network == 'alexnet': # AlexNet will not converge using Xavier - initializer = mx.init.Mixed(['conv2_bias', 'conv4_bias','conv5_bias', - 'conv1_bias', 'conv3_bias','.*'], - [mx.init.Constant(0.1), mx.init.Constant(0.1), mx.init.Constant(0.1), - mx.init.Zero(), mx.init.Zero(), mx.init.Normal()]) + initializer = mx.init.Normal() else: initializer = mx.init.Xavier( rnd_type='gaussian', factor_type="in", magnitude=2) # initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), # evaluation metrices - eval_metrics = ['accuracy','ce'] + eval_metrics = ['accuracy'] if args.top_k > 0: eval_metrics.append(mx.metric.create('top_k_accuracy', top_k=args.top_k)) @@ -203,17 +203,17 @@ def fit(args, network, data_loader, **kwargs): # run model.fit(train, - begin_epoch = args.load_epoch if args.load_epoch else 0, - num_epoch = args.num_epochs, - eval_data = val, - eval_metric = eval_metrics, - kvstore = kv, - optimizer = args.optimizer, - optimizer_params = optimizer_params, - initializer = initializer, - arg_params = arg_params, - aux_params = aux_params, - batch_end_callback = batch_end_callbacks, - 
epoch_end_callback = checkpoint, - allow_missing = True, - monitor = monitor) + begin_epoch = args.load_epoch if args.load_epoch else 0, + num_epoch = args.num_epochs, + eval_data = val, + eval_metric = eval_metrics, + kvstore = kv, + optimizer = args.optimizer, + optimizer_params = optimizer_params, + initializer = initializer, + arg_params = arg_params, + aux_params = aux_params, + batch_end_callback = batch_end_callbacks, + epoch_end_callback = checkpoint, + allow_missing = True, + monitor = monitor) diff --git a/example/image-classification/symbols/alexnet.py b/example/image-classification/symbols/alexnet.py index 3b6066c6d92a..f945b9f87cd9 100755 --- a/example/image-classification/symbols/alexnet.py +++ b/example/image-classification/symbols/alexnet.py @@ -31,14 +31,14 @@ def get_symbol(num_classes, dtype='float32', **kwargs): conv1 = mx.sym.Convolution(name='conv1', data=input_data, kernel=(11, 11), stride=(4, 4), num_filter=96) relu1 = mx.sym.Activation(data=conv1, act_type="relu") - lrn1 = mx.sym.LRN(data=relu1, alpha=0.0001, beta=0.75, knorm=1, nsize=4) + lrn1 = mx.sym.LRN(data=relu1, alpha=0.0001, beta=0.75, knorm=2, nsize=5) pool1 = mx.sym.Pooling( data=lrn1, pool_type="max", kernel=(3, 3), stride=(2,2)) # stage 2 conv2 = mx.sym.Convolution(name='conv2', - data=pool1, kernel=(5, 5), pad=(2, 2), num_filter=192) + data=pool1, kernel=(5, 5), pad=(2, 2), num_filter=256) relu2 = mx.sym.Activation(data=conv2, act_type="relu") - lrn2 = mx.sym.LRN(data=relu2, alpha=0.0001, beta=0.75, knorm=1, nsize=4) + lrn2 = mx.sym.LRN(data=relu2, alpha=0.0001, beta=0.75, knorm=2, nsize=5) pool2 = mx.sym.Pooling(data=lrn2, kernel=(3, 3), stride=(2, 2), pool_type="max") # stage 3 conv3 = mx.sym.Convolution(name='conv3', diff --git a/example/image-classification/symbols/mlp.py b/example/image-classification/symbols/mlp.py index f2330c8fdef3..4b190b29db9e 100644 --- a/example/image-classification/symbols/mlp.py +++ b/example/image-classification/symbols/mlp.py @@ -23,14 +23,10 @@ 
def get_symbol(num_classes=10, **kwargs): data = mx.symbol.Variable('data') data = mx.sym.Flatten(data=data) - fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=1536) + fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 1536) + fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") - fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=1536) - act3 = mx.symbol.Activation(data = fc3, name='relu3', act_type="relu") - fc4 = mx.symbol.FullyConnected(data = act3, name='fc4', num_hidden=1536) - act4 = mx.symbol.Activation(data = fc4, name='relu4', act_type="relu") - fc5 = mx.symbol.FullyConnected(data = act4, name='fc5', num_hidden=3101) - mlp = mx.symbol.SoftmaxOutput(data = fc5, name = 'softmax') + fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes) + mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') return mlp diff --git a/example/image-classification/train_imagenet.py b/example/image-classification/train_imagenet.py index dd45bf3f8ef3..f465fbc5f469 100644 --- a/example/image-classification/train_imagenet.py +++ b/example/image-classification/train_imagenet.py @@ -33,8 +33,6 @@ # use a large aug level data.set_data_aug_level(parser, 3) parser.set_defaults( - data_train = 'data/imagenet1k-train-t3.rec', - data_train_idx = 'data/imagenet1k-train-t3.rec', # network network = 'resnet', num_layers = 50, diff --git a/example/image-classification/train_imagenet_gc.py b/example/image-classification/train_imagenet_gc.py deleted file mode 100644 index eaa4811c205c..000000000000 --- a/example/image-classification/train_imagenet_gc.py +++ /dev/null @@ -1,65 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license 
agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -import argparse -import logging -logging.basicConfig(level=logging.DEBUG) -from common import find_mxnet, data, fit -from common.util import download_file -import mxnet as mx - -if __name__ == '__main__': - # parse args - parser = argparse.ArgumentParser(description="train imagenet gc", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - fit.add_fit_args(parser) - data.add_data_args(parser) - data.add_data_aug_args(parser) - # use a large aug level - data.set_data_aug_level(parser, 3) - parser.set_defaults( - data_train = 'data/imagenet2012-train.rec', - data_train_idx = 'data/imagenet2012-train.idx', - data_val = 'data/imagenet1k-val.rec', - data_val_idx = 'data/imagenet1k-val.idx', - - # network - network = 'resnet', - num_layers = 50, - - # data - num_classes = 1000, - num_examples = 1281167, - image_shape = '3,224,224', - min_random_scale = 1, # if input image has min size k, suggest to use - # 256.0/x, e.g. 
0.533 for 480 - # train - num_epochs = 80, - lr_step_epochs = '30,60', - dtype = 'float32' - ) - args = parser.parse_args() - - # load network - from importlib import import_module - net = import_module('symbols.'+args.network) - sym = net.get_symbol(**vars(args)) - - # train - fit.fit(args, sym, data.get_rec_iter) - diff --git a/example/rnn/lstm_bucketing.py b/example/rnn/lstm_bucketing.py index 93c7821a7264..0f5791e0e0b8 100644 --- a/example/rnn/lstm_bucketing.py +++ b/example/rnn/lstm_bucketing.py @@ -49,7 +49,8 @@ help='show progress for every n batches') parser.add_argument('--gc-type', type=str, default='none', help='type of gradient compression') - +parser.add_argument('--gc-threshold', type=float, default=0.5, + help='threshold for 2bit gradient compression') def tokenize_text(fname, vocab=None, invalid_label=-1, start_label=0): if not os.path.isfile(fname): @@ -114,7 +115,7 @@ def sym_gen(seq_len): sym_gen = sym_gen, default_bucket_key = data_train.default_bucket_key, context = contexts, - compression_params = {'compression':args.gc_type}) + compression_params = {'compression': args.gc_type}) model.fit( train_data = data_train, diff --git a/example/speech_recognition/main.py b/example/speech_recognition/main.py index f041af4a95d2..e45026343de7 100644 --- a/example/speech_recognition/main.py +++ b/example/speech_recognition/main.py @@ -286,6 +286,7 @@ def load_model(args, contexts, data_train): 'train or predict or load can be the candidate for the mode.') # get meta file where character to number conversions are defined + contexts = parse_contexts(args) num_gpu = len(contexts) batch_size = args.config.getint('common', 'batch_size') diff --git a/include/mxnet/gc.h b/include/mxnet/gc.h index e95613843aee..035ce08ac099 100644 --- a/include/mxnet/gc.h +++ b/include/mxnet/gc.h @@ -51,7 +51,7 @@ class Gc { /*! 
* \brief sets gradient compression to given mode * Active mode is when gradients are compressed - * Gc is in inactive mode during init of parameters + * Compression is in inactive mode during init of parameters */ void set_active(bool active); @@ -65,6 +65,10 @@ class Gc { */ CompressionType get_type(); + /*! + * \brief sets two bit gradient compression + * \param threshold float value used for thresholding gradients + */ void SetTwoBitCompression(const float threshold); /*! diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index 5389434fcecc..880cb28f217f 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -68,7 +68,7 @@ class KVStore { /** * \brief Set parameters to use low-bit compressed gradients * \param compression_type type of compression - * \param threshold set the threshold for 2bit compression + * \param threshold threshold for 2bit compression */ virtual void SetGradientCompression(const std::string& compression_type, const float threshold) = 0; @@ -396,9 +396,9 @@ class KVStore { */ std::string type_; - /** \brief gradient compression - * starts with none, used after SetGradientCompression sets the type - * currently there is no support for unsetting gradient compression + /** \brief Gradient compression object starts with GC_NONE mode + * Used if SetGradientCompression sets the type. + * Currently there is no support for un-setting gradient compression */ kvstore::Gc* gc_; diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 80cf6b93981d..4abd292f7013 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -361,9 +361,8 @@ def set_gradient_compression(self, compression_params=None): and specifies the type of gradient compression. Other keys in this dictionary are optional and specific to the type of gradient compression. - 2bit Gradient Compression: - - 2bit gradient compression takes a threshold. This needs to be a positive float. 
+ 2bit Gradient Compression + 2bit gradient compression takes a threshold. This must be a positive float. The technique works by limiting values such that the absolute values of the gradient communicated is less than the threshold. Values which don't meet the threshold are set to 0. @@ -394,7 +393,7 @@ def set_gradient_compression(self, compression_params=None): Only specifying `compression` would use default value for the threshold. To completely specify the arguments for 2bit compression, we would need to pass a dictionary which includes `threshold` like: - {'compression':'2bit', 'threshold':0.5} + {'compression': '2bit', 'threshold': 0.5} compression: str type of low-bit quantization to be used for gradient compression diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 7f7b272fcdeb..2444ca0dc59e 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -98,6 +98,7 @@ def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names, update_o for idx, param_on_devs in enumerate(param_arrays): name = param_names[idx] kvstore.init(name, arg_params[name]) + if update_on_kvstore: kvstore.pull(name, param_on_devs, priority=-idx) diff --git a/python/mxnet/module/bucketing_module.py b/python/mxnet/module/bucketing_module.py index 1396eb575262..5ebc455f6a29 100644 --- a/python/mxnet/module/bucketing_module.py +++ b/python/mxnet/module/bucketing_module.py @@ -52,6 +52,11 @@ class BucketingModule(BaseModule): state_names : list of str States are similar to data and label, but not provided by data iterator. Instead they are initialized to 0 and can be set by set_states() + compression_params : dict + Specifies type of gradient compression and additional arguments depending + on the type of compression being used. For example, 2bit compression requires a threshold. + Arguments would then be {'compression':'2bit', 'threshold':0.5} + See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. 
""" def __init__(self, sym_gen, default_bucket_key=None, logger=logging, context=ctx.cpu(), work_load_list=None, diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index 2daa0cb63925..09f8a830eee6 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -64,7 +64,6 @@ class Module(BaseModule): on the type of compression being used. For example, 2bit compression requires a threshold. Arguments would then be {'compression':'2bit', 'threshold':0.5} See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. - """ def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',), logger=logging, context=ctx.cpu(), work_load_list=None, diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 78fdd8eabee8..94525eb9fe8b 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -734,9 +734,9 @@ int MXKVStoreCreate(const char *type, } int MXKVStoreSetGradientCompression(KVStoreHandle handle, - const char *compress, const float threshold) { + const char *compression, const float threshold) { API_BEGIN(); - static_cast(handle)->SetGradientCompression(compress, threshold); + static_cast(handle)->SetGradientCompression(compression, threshold); API_END(); } diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 04188dec203b..3768db61711a 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -30,7 +30,7 @@ #include #include #include "mxnet/ndarray.h" -#include +#include "mxnet/gc.h" #include "../ndarray/ndarray_function.h" #include "../operator/tensor/sparse_retain-inl.h" namespace mxnet { @@ -491,7 +491,8 @@ class CommDevice : public Comm { const NDArray& Reduce(int key, const std::vector& src, int priority) override { - if (gc_->get_type() != GC_NONE && gc_->is_active()) { + if (gc_->get_type() != GC_NONE) { + CHECK(gc_->is_active()); return ReduceCompressed(key, src, priority); } @@ -516,6 +517,7 @@ class CommDevice : public Comm { std::vector reduce(src.size()); 
CopyFromTo(src[0], &(buf.merged), priority); reduce[0] = buf.merged; + if (buf.copy_buf.empty()) { // TODO(mli) this results in large device memory usage for huge ndarray, // such as the largest fullc in VGG. consider to do segment reduce with @@ -531,6 +533,7 @@ class CommDevice : public Comm { CopyFromTo(src[i+1], &(buf.copy_buf[i]), priority); reduce[i+1] = buf.copy_buf[i]; } + ElementwiseSum(reduce, &buf.merged); return buf.merged; } @@ -698,7 +701,7 @@ class CommDevice : public Comm { NDArray merged; /// \brief the gpu buffer std::vector copy_buf; - /// \brief the residual buffer + /// \brief the residual buffer for gradient compression std::vector residual; /// \brief the small buffer for compressed data in sender std::vector small_send_buf; diff --git a/src/kvstore/gc-inl.h b/src/kvstore/gc-inl.h index 1cabf1272e86..41300b829b59 100644 --- a/src/kvstore/gc-inl.h +++ b/src/kvstore/gc-inl.h @@ -18,7 +18,7 @@ */ /*! - * \file gc.cu + * \file gc-inl.h * \author Rahul Huilgol * \brief Declares and defines functions used to quantize and dequantize data */ @@ -47,8 +47,8 @@ struct quantize_2bit { // init to 0 *compr_block = 0; // start and end are indices in original grad array - int start = out_block_id << 4; - int end = (start + 16 <= original_size) ? start + 16 : original_size; + const int start = out_block_id << 4; + const int end = (start + 16 <= original_size) ? start + 16 : original_size; // cast as char* to manipulate bits of float addresses char *block_ptr = reinterpret_cast < char * > (compr_block); // masks to set bits when value meets pos_threshold @@ -105,10 +105,10 @@ struct dequantize_2bit { const uint8_t negbits[] = {0x80, 0x20, 0x08, 0x02}; // col denotes which two bits of a byte are set for this value // col=0 implies first two bits, col=3 implies last two bits,... 
- int col = i & 3; - uint8_t mask = posbits[col]; - uint8_t negmask = negbits[col]; - uint8_t masked = *ch_ptr & mask; + const int col = i & 3; + const uint8_t mask = posbits[col]; + const uint8_t negmask = negbits[col]; + const uint8_t masked = *ch_ptr & mask; if (masked == mask) { *outval = pos_threshold; } else if (masked == negmask) { diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc index e152e3c4cc8a..f8a4253b88fd 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gc.cc @@ -22,11 +22,10 @@ * \brief Gradient compression for kvstore * \author Rahul Huilgol */ + #include #include #include -// for get_rank -#include #include "./gc-inl.h" namespace mxnet { @@ -63,7 +62,8 @@ void Gc::SetParams(const std::string &compression_type, const float threshold) { void Gc::set_active(bool active) { active_ = active; } -// can be active when type is none + +// note that this can be active when type is none, it denotes init is done for now bool Gc::is_active() { return active_; } @@ -114,9 +114,11 @@ int64_t Gc::GetCompressedSize(const int64_t original_size) { void Gc::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, mxnet::NDArray *residual, const int priority) { - CHECK(from.shape().ndim() != 0) << "source operands have zero dimension shape"; - int a = from.ctx().dev_mask(); - int b = to->ctx().dev_mask(); + CHECK(from.shape().ndim() != 0) << "source operand has zero dimension shape"; + CHECK(to->shape().ndim() != 0) << "destination operand has zero dimension shape"; + CHECK(residual->shape().ndim() != 0) << "residual operand has zero dimension shape"; + const int a = from.ctx().dev_mask(); + const int b = to->ctx().dev_mask(); const float threshold = threshold_; if (type_ == GC_TWO_BIT) { if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { @@ -148,7 +150,8 @@ void Gc::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, } void Gc::Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority) { - CHECK(from.shape().ndim() != 0) << 
"source operands have zero dimension shape"; + CHECK(from.shape().ndim() != 0) << "source operands has zero dimension shape"; + CHECK(to->shape().ndim() != 0) << "destination operand has zero dimension shape"; const int a = from.ctx().dev_mask(); const int b = to->ctx().dev_mask(); const float threshold = threshold_; diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 09f056a46627..1cc64782dd22 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -88,9 +88,9 @@ class KVStoreDist : public KVStoreLocal { void SetGradientCompression(const std::string& compression_type, const float threshold) override { KVStoreLocal::SetGradientCompression(compression_type, threshold); if (get_rank() == 0) { + // only rank 0 because init happens by rank 0 only SendCommandToServers(kSetGradientCompression, gc_->EncodeParams()); } - //TODO barrier? } void Barrier() override { @@ -223,6 +223,8 @@ class KVStoreDist : public KVStoreLocal { RunContext rctx, Engine::CallbackOnComplete cb) { // convert to ps keys size_t size = recv_buf.shape().Size(); + + // even if inactive should use compressed_pskv for pull when type is not none PSKV& pskv = (gc_->get_type() == GC_NONE) ? 
EncodeDefaultKey(key, size, false) : EncodeCompressedKey(key, size, false); @@ -305,6 +307,7 @@ class KVStoreDist : public KVStoreLocal { GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals); // set active for non init pushes + // do_merge is proxy for non-init push if (do_merge && !gc_->is_active()) gc_->set_active(true); for (size_t i = 0; i < uniq_keys.size(); ++i) { @@ -339,8 +342,7 @@ class KVStoreDist : public KVStoreLocal { PushDefault(key, comm_buf, pskv, priority); } else { // returns push_pskv if active, else pull_pskv - // we want inactive gc to send uncompressed gradients, but sharded same as active gc - // but calculates both push and pull pskv + // we want inactive gc to send uncompressed gradients, but sharded in the same way as active gc PSKV &pskv = EncodeCompressedKey(key, comm_buf.shape().Size(), gc_->is_active()); if (gc_->is_active()) { PushCompressed(key, comm_buf, pskv, priority); @@ -373,7 +375,7 @@ class KVStoreDist : public KVStoreLocal { } gc_->Quantize(comm_buf, &small_buf, &res_buf, priority); auto push_to_servers = - [this, key, pskv, comm_buf, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) { + [this, key, pskv, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) { size_t size = small_buf.shape().Size(); real_t* data = small_buf.data().dptr(); #if MKL_EXPERIMENTAL == 1 diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 42d15ba1ae2e..10f7ab44b554 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -492,9 +492,9 @@ class KVStoreDistServer { ApplyUpdates(key, &merged, &stored, server); } else { // async push - exec_.Exec([this, key, &recved, &stored]() { - CHECK(updater_); - updater_(key, recved, &stored); + exec_.Exec([this, key, &recved, &stored](){ + CHECK(updater_); + updater_(key, recved, &stored); }); server->Response(req_meta); stored.WaitToRead(); diff --git a/tools/launch.py b/tools/launch.py index 0372a03bb350..de42ea2a7dd3 100755 
--- a/tools/launch.py +++ b/tools/launch.py @@ -35,8 +35,7 @@ def dmlc_opts(opts): '--num-servers', str(opts.num_servers), '--cluster', opts.launcher, '--host-file', opts.hostfile, - '--sync-dst-dir', opts.sync_dst_dir, - '--pass-env', opts.pass_env] + '--sync-dst-dir', opts.sync_dst_dir] args += opts.command; try: from dmlc_tracker import opts @@ -65,9 +64,6 @@ def main(): parser.add_argument('--launcher', type=str, default='ssh', choices = ['local', 'ssh', 'mpi', 'sge', 'yarn'], help = 'the launcher to use') - parser.add_argument('--pass-env', type=str, default='', - help = 'given a comma separated list of environment \ - variables, passes their values while launching job') parser.add_argument('command', nargs='+', help = 'command for launching the program') args, unknown = parser.parse_known_args() From b60b3fb431f41dbfd8a8fa999821cc0f24f441c3 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 15 Nov 2017 01:25:00 -0800 Subject: [PATCH 215/237] fix lint issues --- include/mxnet/gc.h | 14 +++++------ src/kvstore/comm.h | 2 -- src/kvstore/gc-inl.h | 40 ++++++++++++++++++------------- src/kvstore/gc.cc | 20 ++++++++-------- src/kvstore/gc.cu | 10 ++++---- src/kvstore/kvstore_dist.h | 12 ++++++---- src/kvstore/kvstore_dist_server.h | 2 +- src/kvstore/kvstore_local.h | 2 +- 8 files changed, 56 insertions(+), 46 deletions(-) diff --git a/include/mxnet/gc.h b/include/mxnet/gc.h index 035ce08ac099..943904ca27d0 100644 --- a/include/mxnet/gc.h +++ b/include/mxnet/gc.h @@ -23,8 +23,8 @@ * \author Rahul Huilgol */ -#ifndef MXNET_KVSTORE_GC_H -#define MXNET_KVSTORE_GC_H +#ifndef MXNET_GC_H +#define MXNET_GC_H #include #include"./ndarray.h" @@ -36,7 +36,7 @@ enum CompressionType { }; class Gc { -public: + public: Gc(); virtual ~Gc() {} @@ -113,7 +113,7 @@ class Gc { */ void Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority); -private: + private: /*! 
* \brief denotes the type of gradient compression which has been set */ @@ -133,6 +133,6 @@ class Gc { */ float threshold_ = 0; }; -} // namespace kvstore -} // namespace mxnet -#endif // MXNET_KVSTORE_GC_H +} // namespace kvstore +} // namespace mxnet +#endif // MXNET_GC_H diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 3768db61711a..baa90783f5c1 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -92,7 +92,6 @@ class Comm { Context pinned_ctx_; Gc* gc_; - }; /** @@ -490,7 +489,6 @@ class CommDevice : public Comm { const NDArray& Reduce(int key, const std::vector& src, int priority) override { - if (gc_->get_type() != GC_NONE) { CHECK(gc_->is_active()); return ReduceCompressed(key, src, priority); diff --git a/src/kvstore/gc-inl.h b/src/kvstore/gc-inl.h index 41300b829b59..91e7df8adbb2 100644 --- a/src/kvstore/gc-inl.h +++ b/src/kvstore/gc-inl.h @@ -22,11 +22,14 @@ * \author Rahul Huilgol * \brief Declares and defines functions used to quantize and dequantize data */ +#ifndef MXNET_KVSTORE_GC_INL_H_ +#define MXNET_KVSTORE_GC_INL_H_ +#include #include "../operator/mxnet_op.h" namespace mxnet { -namespace kvstore{ +namespace kvstore { // these gpu functions are defined in gc.cu void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, @@ -42,7 +45,8 @@ struct quantize_2bit { float *residual, const float neg_threshold, const float pos_threshold) { - // this block contains the compressed representation of upto 16 values starting from out_block_id*16 + // this block contains the compressed representation of + // upto 16 values starting from out_block_id*16 float *compr_block = out + out_block_id; // init to 0 *compr_block = 0; @@ -79,14 +83,15 @@ struct quantize_2bit { template void Quantize2BitKernelLaunch(mshadow::Stream *s, const std::vector &inputs, const float threshold) { - mxnet::op::mxnet_op::Kernel::Launch(s, - inputs[2].Size(), // compressed array size - inputs[0].Size(), // original size - inputs[2].dptr(), // compressed array - 
inputs[0].dptr(), // original array - inputs[1].dptr(), // residual array - -1 *threshold, // negative threshold - threshold); // positive threshold + mxnet::op::mxnet_op::Kernel + ::Launch(s, + inputs[2].Size(), // compressed array size + inputs[0].Size(), // original size + inputs[2].dptr(), // compressed array + inputs[0].dptr(), // original array + inputs[1].dptr(), // residual array + -1 *threshold, // negative threshold + threshold); // positive threshold } struct dequantize_2bit { @@ -124,12 +129,13 @@ struct dequantize_2bit { template void Dequantize2BitKernelLaunch(mshadow::Stream *s, const std::vector &inputs, const float threshold) { - mxnet::op::mxnet_op::Kernel::Launch(s, - inputs[1].Size(), // original size - inputs[1].dptr(), // out array - inputs[0].dptr(), // compressed array - -1 *threshold, // negative threshold - threshold); // positive threshold + mxnet::op::mxnet_op::Kernel + ::Launch(s, + inputs[1].Size(), // original size + inputs[1].dptr(), // out array + inputs[0].dptr(), // compressed array + -1 *threshold, // negative threshold + threshold); // positive threshold } inline void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, @@ -143,3 +149,5 @@ inline void Dequantize2BitImpl(mshadow::Stream *s, const std::vect } } // namespace kvstore } // namespace mxnet + +#endif // MXNET_KVSTORE_GC_INL_H_ diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc index f8a4253b88fd..9874dce4bd24 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gc.cc @@ -123,10 +123,10 @@ void Gc::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, if (type_ == GC_TWO_BIT) { if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), residual->data(), to->data()}; - Quantize2BitImpl(ctx.get_stream(), inputs, threshold); - }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - mxnet::FnProperty::kNormal, priority, 
PROFILER_MESSAGE("QuantizeCPU")); + std::vector inputs = {from.data(), residual->data(), to->data()}; + Quantize2BitImpl(ctx.get_stream(), inputs, threshold); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeCPU")); } else { #if MXNET_USE_CUDA if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { @@ -158,10 +158,10 @@ void Gc::Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int pr if (type_ == GC_TWO_BIT) { if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), to->data()}; - Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); - }, from.ctx(), {from.var()}, {to->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); + std::vector inputs = {from.data(), to->data()}; + Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); + }, from.ctx(), {from.var()}, {to->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeCPU")); } else { #if MXNET_USE_CUDA if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { @@ -184,6 +184,6 @@ void Gc::Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int pr } } -} // namespace kvstore -} // namespace mxnet +} // namespace kvstore +} // namespace mxnet diff --git a/src/kvstore/gc.cu b/src/kvstore/gc.cu index 9279e6a1f176..fee944db33bf 100644 --- a/src/kvstore/gc.cu +++ b/src/kvstore/gc.cu @@ -27,12 +27,14 @@ namespace mxnet { namespace kvstore { -void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { +void Quantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, + const float threshold) { Quantize2BitKernelLaunch(s, inputs, threshold); } -void Dequantize2BitImpl(mshadow::Stream* s, const std::vector& inputs, const float threshold) { +void Dequantize2BitImpl(mshadow::Stream* s, const 
std::vector& inputs, + const float threshold) { Dequantize2BitKernelLaunch(s, inputs, threshold); } -} // namespace kvstore -} // namespace mxnet \ No newline at end of file +} // namespace kvstore +} // namespace mxnet diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 1cc64782dd22..182e23e3ae51 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -342,7 +342,8 @@ class KVStoreDist : public KVStoreLocal { PushDefault(key, comm_buf, pskv, priority); } else { // returns push_pskv if active, else pull_pskv - // we want inactive gc to send uncompressed gradients, but sharded in the same way as active gc + // we want inactive gc to send uncompressed gradients, + // but sharded in the same way as active gc PSKV &pskv = EncodeCompressedKey(key, comm_buf.shape().Size(), gc_->is_active()); if (gc_->is_active()) { PushCompressed(key, comm_buf, pskv, priority); @@ -361,7 +362,7 @@ class KVStoreDist : public KVStoreLocal { } } - void PushCompressed(int key, const NDArray& comm_buf, PSKV& pskv, int priority) { + void PushCompressed(int key, const NDArray& comm_buf, const PSKV& pskv, int priority) { auto &small_buf = compr_buf_[key]; auto &res_buf = residual_[key]; size_t original_size = comm_buf.shape().Size(); @@ -605,12 +606,13 @@ class KVStoreDist : public KVStoreLocal { for (int i = 0; i < num_servers; ++i) { size_t part_compr, part_orig; - if(i==num_servers-1){ + if (i == num_servers-1) { part_compr = compr_size - push_pskv.size; part_orig = original_size - pull_pskv.size; } else { - part_compr = static_cast (round(static_cast(compr_size)/num_servers*(i+1))) - - static_cast (round(static_cast(compr_size)/num_servers*(i))); + part_compr = + static_cast (round(static_cast(compr_size)/num_servers*(i+1))) - + static_cast (round(static_cast(compr_size)/num_servers*(i))); part_orig = part_compr * gc_->GetCompressionFactor(); } diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 
10f7ab44b554..c40fb585329b 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -366,7 +366,7 @@ class KVStoreDistServer { } } - void DefaultStorageResponse(int key, NDArray& stored, + void DefaultStorageResponse(int key, const NDArray& stored, const ps::KVMeta& req_meta, const ps::KVPairs &req_data, ps::KVServer* server) { diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index e89c2db51f22..d8ca30bbc9c3 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -141,7 +141,7 @@ class KVStoreLocal : public KVStore { gc_->SetParams(compression_type, threshold); } -private: + private: virtual void InitImpl(const std::vector& keys, const std::vector& values) { for (size_t i = 0; i < keys.size(); ++i) { From 83289239c862c4336853841977dafcf34160291a Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 15 Nov 2017 08:36:39 -0800 Subject: [PATCH 216/237] fix lint issues --- include/mxnet/gc.h | 6 +++--- src/kvstore/gc-inl.h | 14 ++++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/mxnet/gc.h b/include/mxnet/gc.h index 943904ca27d0..20f3c94cef51 100644 --- a/include/mxnet/gc.h +++ b/include/mxnet/gc.h @@ -23,8 +23,8 @@ * \author Rahul Huilgol */ -#ifndef MXNET_GC_H -#define MXNET_GC_H +#ifndef MXNET_GC_H_ +#define MXNET_GC_H_ #include #include"./ndarray.h" @@ -135,4 +135,4 @@ class Gc { }; } // namespace kvstore } // namespace mxnet -#endif // MXNET_GC_H +#endif // MXNET_GC_H_ diff --git a/src/kvstore/gc-inl.h b/src/kvstore/gc-inl.h index 91e7df8adbb2..2723c4b33745 100644 --- a/src/kvstore/gc-inl.h +++ b/src/kvstore/gc-inl.h @@ -138,16 +138,18 @@ void Dequantize2BitKernelLaunch(mshadow::Stream *s, const std::vector *s, const std::vector &inputs, - const float threshold) { +inline void Quantize2BitImpl(mshadow::Stream *s, + const std::vector &inputs, + const float threshold) { Quantize2BitKernelLaunch(s, inputs, threshold); } -inline void 
Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, - const float threshold) { +inline void Dequantize2BitImpl(mshadow::Stream *s, + const std::vector &inputs, + const float threshold) { Dequantize2BitKernelLaunch(s, inputs, threshold); } -} // namespace kvstore -} // namespace mxnet +} // namespace kvstore +} // namespace mxnet #endif // MXNET_KVSTORE_GC_INL_H_ From aa242b8063def9eeaafce58c85c864bd0d341596 Mon Sep 17 00:00:00 2001 From: Rahul Date: Wed, 15 Nov 2017 08:50:15 -0800 Subject: [PATCH 217/237] remove caltech --- example/image-classification/train_caltech.py | 59 ------------------- 1 file changed, 59 deletions(-) delete mode 100644 example/image-classification/train_caltech.py diff --git a/example/image-classification/train_caltech.py b/example/image-classification/train_caltech.py deleted file mode 100644 index 53e5702b3d13..000000000000 --- a/example/image-classification/train_caltech.py +++ /dev/null @@ -1,59 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import os -import argparse -import logging -logging.basicConfig(level=logging.DEBUG) -from common import find_mxnet, data, fit - -if __name__ == '__main__': - train_fname = os.path.join("data", "caltech-256-60-val.rec") - val_fname = os.path.join("data", "caltech-256-60-train.rec") - # parse args - parser = argparse.ArgumentParser(description="train caltech256", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - fit.add_fit_args(parser) - data.add_data_args(parser) - data.add_data_aug_args(parser) - data.set_data_aug_level(parser, 2) - parser.set_defaults( - # network - network = 'mlp', - num_layers = 110, - # data - data_train = train_fname, - data_val = val_fname, - num_classes = 256, - num_examples = 15420, - image_shape = '3,256,256', - pad_size = 4, - # train - batch_size = 128, - num_epochs = 300, - lr = .05, - lr_step_epochs = '200,250', - ) - args = parser.parse_args() - - # load network - from importlib import import_module - net = import_module('symbols.'+args.network) - sym = net.get_symbol(**vars(args)) - - # train - fit.fit(args, sym, data.get_rec_iter) From 62c52552f3530795d42c83d11a00fff90bd6e98c Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 15 Nov 2017 17:13:04 -0800 Subject: [PATCH 218/237] address some comments: shared_ptr, documentation, indentaion, new functions, check_eq --- include/mxnet/kvstore.h | 2 +- python/mxnet/kvstore.py | 38 +++++++++-------------- src/kvstore/comm.h | 62 +++++++++++++++++-------------------- src/kvstore/gc.cc | 42 ++++++++++++------------- src/kvstore/kvstore_dist.h | 5 ++- src/kvstore/kvstore_local.h | 3 +- 6 files changed, 67 insertions(+), 85 deletions(-) diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index 880cb28f217f..21c87fda6b0e 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -400,7 +400,7 @@ class KVStore { * Used if SetGradientCompression sets the type. 
* Currently there is no support for un-setting gradient compression */ - kvstore::Gc* gc_; + std::shared_ptr gc_; /** diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 4abd292f7013..bb6358660443 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -349,7 +349,7 @@ def row_sparse_pull(self, key, out=None, priority=0, row_ids=None): check_call(_LIB.MXKVStorePullRowSparse( self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority))) - def set_gradient_compression(self, compression_params=None): + def set_gradient_compression(self, compression_params=(('compression', '2bit'),)): """ Specifies type of low-bit quantization for gradient compression if any, and additional arguments depending on the type of compression being used. @@ -357,19 +357,22 @@ def set_gradient_compression(self, compression_params=None): ---------- compression_params : dict `compression_params` is a dictionary specifying the type and parameters - for gradient compression. The key `compression` in this dictionary is a required argument - and specifies the type of gradient compression. Other keys in this - dictionary are optional and specific to the type of gradient compression. - - 2bit Gradient Compression - 2bit gradient compression takes a threshold. This must be a positive float. - The technique works by limiting values such that the absolute values of the gradient - communicated is less than the threshold. Values which don't meet the threshold - are set to 0. + for gradient compression. The key `compression` in this dictionary is a + required string argument and specifies the type of gradient compression. + Other keys in this dictionary are optional and specific to the type + of gradient compression. Defaults to (('compression', '2bit'),). + The default value is not a dict, + just to avoid pylint warning on dangerous default values. + + 2bit Gradient Compression takes a positive float `threshold`. 
+ The technique works by thresholding values such that positive values in the + gradient above threshold will be set to threshold. Negative values whose absolute + values are higher than threshold, will be set to the negative of threshold. + Values whose absolute values are less than threshold will be set to 0. By doing so, each value in the gradient is in one of three states. 2bits are used to represent these states, and every 16 float values in the original gradient can be represented using one float. This compressed representation - can reduce communication costs. The difference between these values and + can reduce communication costs. The difference between these thresholded values and original values is stored at the sender's end as residual and added to the gradient in the next iteration. @@ -395,19 +398,6 @@ def set_gradient_compression(self, compression_params=None): a dictionary which includes `threshold` like: {'compression': '2bit', 'threshold': 0.5} - compression: str - type of low-bit quantization to be used for gradient compression - Can only be '2bit' or `none` for now. - 2bit gradient compression uses 2bit quantization with residual to compress - gradients. It works by converts each value in the original gradient to use - 2 bits, causing size of gradient to be 1/16th of the original gradient - threshold: float - must be greater than 0 - threshold used for 2bit quantization of gradients - Positive values in gradient above threshold will be set to - threshold. Negative values whose absolute values are higher than threshold, - will be set to the negative of threshold. Values whose absolute values are - less than threshold will be set to 0. 
""" if compression_params: if not isinstance(compression_params, dict): diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index baa90783f5c1..8928fe0b7b5a 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -84,14 +84,14 @@ class Comm { * \brief Sets gradient compression parameters to be able to * perform reduce with compressed gradients */ - void SetGradientCompression(Gc* gc) { + void SetGradientCompression(std::shared_ptr gc) { gc_ = gc; } protected: Context pinned_ctx_; - Gc* gc_; + std::shared_ptr gc_; }; /** @@ -487,6 +487,19 @@ class CommDevice : public Comm { } } + void InitBuffersAndComm(const std::vector& src) { + if (!inited_) { + std::vector devs; + for (const auto& a : src) { + devs.push_back(a.ctx()); + } + InitMergeBuffer(devs); + if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) { + EnableP2P(devs); + } + } + } + const NDArray& Reduce(int key, const std::vector& src, int priority) override { if (gc_->get_type() != GC_NONE) { @@ -500,17 +513,7 @@ class CommDevice : public Comm { return src[0]; } - if (!inited_) { - std::vector devs; - for (const auto& a : src) { - devs.push_back(a.ctx()); - } - InitMergeBuffer(devs); - if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) { - EnableP2P(devs); - } - } - + InitBuffersAndComm(src); auto& buf = merge_buf_[key]; std::vector reduce(src.size()); CopyFromTo(src[0], &(buf.merged), priority); @@ -538,24 +541,15 @@ class CommDevice : public Comm { const NDArray& ReduceCompressed(int key, const std::vector& src, int priority) { - if (!inited_) { - std::vector devs; - for (const auto& a : src) { - devs.push_back(a.ctx()); - } - InitMergeBuffer(devs); - if (dmlc::GetEnv("MXNET_ENABLE_GPU_P2P", 1)) { - EnableP2P(devs); - } - } + InitBuffersAndComm(src); auto& buf = merge_buf_[key]; std::vector reduce(src.size()); if (buf.copy_buf.empty()) { // one buf for each context buf.copy_buf.resize(src.size()); - buf.small_recv_buf.resize(src.size()); - buf.small_send_buf.resize(src.size()); + 
buf.compressed_recv_buf.resize(src.size()); + buf.compressed_send_buf.resize(src.size()); buf.residual.resize(src.size()); for (size_t i = 0; i < src.size(); ++i) { @@ -565,9 +559,9 @@ class CommDevice : public Comm { false, buf.merged.dtype()); buf.residual[i] = 0; int64_t small_size = gc_->GetCompressedSize(buf.merged.shape().Size()); - buf.small_recv_buf[i] = NDArray(TShape{small_size}, buf.merged.ctx(), + buf.compressed_recv_buf[i] = NDArray(TShape{small_size}, buf.merged.ctx(), false, buf.merged.dtype()); - buf.small_send_buf[i] = NDArray(TShape{small_size}, src[i].ctx(), + buf.compressed_send_buf[i] = NDArray(TShape{small_size}, src[i].ctx(), false, buf.merged.dtype()); } } @@ -576,14 +570,14 @@ class CommDevice : public Comm { // compress before copy // this is done even if the data is on same context as copy_buf because // we don't want the training to be biased towards data on this GPU - gc_->Quantize(src[i], &(buf.small_send_buf[i]), &(buf.residual[i]), priority); - if (buf.small_send_buf[i].ctx() != buf.small_recv_buf[i].ctx()) { - CopyFromTo(buf.small_send_buf[i], &(buf.small_recv_buf[i]), priority); + gc_->Quantize(src[i], &(buf.compressed_send_buf[i]), &(buf.residual[i]), priority); + if (buf.compressed_send_buf[i].ctx() != buf.compressed_recv_buf[i].ctx()) { + CopyFromTo(buf.compressed_send_buf[i], &(buf.compressed_recv_buf[i]), priority); } else { // avoid memory copy when they are on same context - buf.small_recv_buf[i] = buf.small_send_buf[i]; + buf.compressed_recv_buf[i] = buf.compressed_send_buf[i]; } - gc_->Dequantize(buf.small_recv_buf[i], &(buf.copy_buf[i]), priority); + gc_->Dequantize(buf.compressed_recv_buf[i], &(buf.copy_buf[i]), priority); reduce[i] = buf.copy_buf[i]; } ElementwiseSum(reduce, &buf.merged); @@ -702,9 +696,9 @@ class CommDevice : public Comm { /// \brief the residual buffer for gradient compression std::vector residual; /// \brief the small buffer for compressed data in sender - std::vector small_send_buf; + std::vector 
compressed_send_buf; /// \brief the small buffer for compressed data in receiver - std::vector small_recv_buf; + std::vector compressed_recv_buf; }; std::unordered_map merge_buf_; bool inited_; diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc index 9874dce4bd24..01b0441e54a0 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gc.cc @@ -130,18 +130,18 @@ void Gc::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, } else { #if MXNET_USE_CUDA if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), residual->data(), to->data()}; - Quantize2BitImpl(ctx.get_stream(), inputs, threshold); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, from.ctx(), {from.var()}, {to->var(), residual->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); - } else { - LOG(FATAL) << "unknown device mask"; - } + mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), residual->data(), to->data()}; + Quantize2BitImpl(ctx.get_stream(), inputs, threshold); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {to->var(), residual->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("QuantizeGPU")); + } else { + LOG(FATAL) << "unknown device mask"; + } #else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; #endif } } else { @@ -165,16 +165,16 @@ void Gc::Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int pr } else { #if MXNET_USE_CUDA if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) { - mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { - std::vector inputs = {from.data(), to->data()}; - Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); - // Wait GPU kernel to complete - 
ctx.get_stream()->Wait(); - }, from.ctx(), {from.var()}, {to->var()}, - mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); - } else { - LOG(FATAL) << "unknown device mask"; - } + mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { + std::vector inputs = {from.data(), to->data()}; + Dequantize2BitImpl(ctx.get_stream(), inputs, threshold); + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + }, from.ctx(), {from.var()}, {to->var()}, + mxnet::FnProperty::kNormal, priority, PROFILER_MESSAGE("DequantizeGPU")); + } else { + LOG(FATAL) << "unknown device mask"; + } #else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; #endif diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 182e23e3ae51..20ee339f7e24 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -352,9 +352,8 @@ class KVStoreDist : public KVStoreLocal { } } } else if (storage_type == kRowSparseStorage) { - if (gc_->get_type() != GC_NONE) { - LOG(FATAL) << "Gradient compression for row sparse storage type is not supported"; - } + CHECK_EQ(gc_->get_type(), GC_NONE) + << "Gradient compression for row sparse storage type is not supported"; PushRowSparse(key, comm_buf, priority); } else { LOG(FATAL) << "unknown storage type"; diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index d8ca30bbc9c3..2cd916050bd3 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -58,12 +58,11 @@ class KVStoreLocal : public KVStore { comm_ = new CommCPU(); } pinned_ctx_ = comm_->pinned_ctx(); - gc_ = new Gc(); + gc_ = std::make_shared(); } virtual ~KVStoreLocal() { delete comm_; - delete gc_; } void Init(const std::vector& keys, From b8b1d66c16061b2e87871370d985eacd6a2bb24b Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 15 Nov 2017 17:18:20 -0800 Subject: [PATCH 219/237] move header --- include/mxnet/kvstore.h | 2 +- src/kvstore/comm.h | 2 +- src/kvstore/gc.cc | 4 ++-- {include/mxnet => 
src/kvstore}/gc.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) rename {include/mxnet => src/kvstore}/gc.h (99%) diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index 21c87fda6b0e..a19ff798c654 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -30,7 +30,7 @@ #include #include #include -#include "./gc.h" +#include "kvstore/gc.h" #include "./ndarray.h" #if MXNET_USE_DIST_KVSTORE #include "ps/ps.h" diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 8928fe0b7b5a..07a4ee60b5a4 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -30,7 +30,7 @@ #include #include #include "mxnet/ndarray.h" -#include "mxnet/gc.h" +#include "gc.h" #include "../ndarray/ndarray_function.h" #include "../operator/tensor/sparse_retain-inl.h" namespace mxnet { diff --git a/src/kvstore/gc.cc b/src/kvstore/gc.cc index 01b0441e54a0..71b40145f69c 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gc.cc @@ -23,10 +23,10 @@ * \author Rahul Huilgol */ -#include #include #include -#include "./gc-inl.h" +#include "gc.h" +#include "gc-inl.h" namespace mxnet { namespace kvstore { diff --git a/include/mxnet/gc.h b/src/kvstore/gc.h similarity index 99% rename from include/mxnet/gc.h rename to src/kvstore/gc.h index 20f3c94cef51..7cf5c0450624 100644 --- a/include/mxnet/gc.h +++ b/src/kvstore/gc.h @@ -26,7 +26,7 @@ #ifndef MXNET_GC_H_ #define MXNET_GC_H_ #include -#include"./ndarray.h" +#include"mxnet/ndarray.h" namespace mxnet { namespace kvstore { From b66a3f2541e883d6e44a4287adf2e370f55ca3f0 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 15 Nov 2017 18:02:07 -0800 Subject: [PATCH 220/237] include header corrected --- include/mxnet/kvstore.h | 2 +- src/kvstore/comm.h | 2 ++ src/kvstore/gc.h | 6 +++--- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index a19ff798c654..7b62d39cac95 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -30,7 +30,7 @@ #include 
#include #include -#include "kvstore/gc.h" +#include "../../src/kvstore/gc.h" #include "./ndarray.h" #if MXNET_USE_DIST_KVSTORE #include "ps/ps.h" diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 07a4ee60b5a4..bff10fd876d4 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -571,12 +571,14 @@ class CommDevice : public Comm { // this is done even if the data is on same context as copy_buf because // we don't want the training to be biased towards data on this GPU gc_->Quantize(src[i], &(buf.compressed_send_buf[i]), &(buf.residual[i]), priority); + if (buf.compressed_send_buf[i].ctx() != buf.compressed_recv_buf[i].ctx()) { CopyFromTo(buf.compressed_send_buf[i], &(buf.compressed_recv_buf[i]), priority); } else { // avoid memory copy when they are on same context buf.compressed_recv_buf[i] = buf.compressed_send_buf[i]; } + gc_->Dequantize(buf.compressed_recv_buf[i], &(buf.copy_buf[i]), priority); reduce[i] = buf.copy_buf[i]; } diff --git a/src/kvstore/gc.h b/src/kvstore/gc.h index 7cf5c0450624..e0801d5b795f 100644 --- a/src/kvstore/gc.h +++ b/src/kvstore/gc.h @@ -23,8 +23,8 @@ * \author Rahul Huilgol */ -#ifndef MXNET_GC_H_ -#define MXNET_GC_H_ +#ifndef MXNET_KVSTORE_GC_H_ +#define MXNET_KVSTORE_GC_H_ #include #include"mxnet/ndarray.h" @@ -135,4 +135,4 @@ class Gc { }; } // namespace kvstore } // namespace mxnet -#endif // MXNET_GC_H_ +#endif // MXNET_KVSTORE_GC_H_ From f32b391ed94ad4b7b50a9d9f33926d626da24b93 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 15 Nov 2017 18:07:51 -0800 Subject: [PATCH 221/237] include header corrected --- include/mxnet/kvstore.h | 2 +- src/kvstore/comm.h | 2 +- src/kvstore/{gc-inl.h => gradient_compression-inl.h} | 8 ++++---- src/kvstore/{gc.cc => gradient_compression.cc} | 6 +++--- src/kvstore/{gc.cu => gradient_compression.cu} | 4 ++-- src/kvstore/{gc.h => gradient_compression.h} | 8 ++++---- 6 files changed, 15 insertions(+), 15 deletions(-) rename src/kvstore/{gc-inl.h => gradient_compression-inl.h} (97%) 
rename src/kvstore/{gc.cc => gradient_compression.cc} (98%) rename src/kvstore/{gc.cu => gradient_compression.cu} (94%) rename src/kvstore/{gc.h => gradient_compression.h} (96%) diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index 7b62d39cac95..d6aa670a0db9 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -30,7 +30,7 @@ #include #include #include -#include "../../src/kvstore/gc.h" +#include "../../src/kvstore/gradient_compression.h" #include "./ndarray.h" #if MXNET_USE_DIST_KVSTORE #include "ps/ps.h" diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index bff10fd876d4..815c6bc0ff8e 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -30,7 +30,7 @@ #include #include #include "mxnet/ndarray.h" -#include "gc.h" +#include "gradient_compression.h" #include "../ndarray/ndarray_function.h" #include "../operator/tensor/sparse_retain-inl.h" namespace mxnet { diff --git a/src/kvstore/gc-inl.h b/src/kvstore/gradient_compression-inl.h similarity index 97% rename from src/kvstore/gc-inl.h rename to src/kvstore/gradient_compression-inl.h index 2723c4b33745..3e182232f47e 100644 --- a/src/kvstore/gc-inl.h +++ b/src/kvstore/gradient_compression-inl.h @@ -18,12 +18,12 @@ */ /*! 
- * \file gc-inl.h + * \file gradient_compression-inl.h * \author Rahul Huilgol * \brief Declares and defines functions used to quantize and dequantize data */ -#ifndef MXNET_KVSTORE_GC_INL_H_ -#define MXNET_KVSTORE_GC_INL_H_ +#ifndef MXNET_KVSTORE_GRADIENT_COMPRESSION_INL_H_ +#define MXNET_KVSTORE_GRADIENT_COMPRESSION_INL_H_ #include #include "../operator/mxnet_op.h" @@ -152,4 +152,4 @@ inline void Dequantize2BitImpl(mshadow::Stream *s, } // namespace kvstore } // namespace mxnet -#endif // MXNET_KVSTORE_GC_INL_H_ +#endif // MXNET_KVSTORE_GRADIENT_COMPRESSION_INL_H_ diff --git a/src/kvstore/gc.cc b/src/kvstore/gradient_compression.cc similarity index 98% rename from src/kvstore/gc.cc rename to src/kvstore/gradient_compression.cc index 71b40145f69c..43926b54cce0 100644 --- a/src/kvstore/gc.cc +++ b/src/kvstore/gradient_compression.cc @@ -18,15 +18,15 @@ */ /*! - * \file gc.cc + * \file gradient_compression.cc * \brief Gradient compression for kvstore * \author Rahul Huilgol */ #include #include -#include "gc.h" -#include "gc-inl.h" +#include "gradient_compression.h" +#include "gradient_compression-inl.h" namespace mxnet { namespace kvstore { diff --git a/src/kvstore/gc.cu b/src/kvstore/gradient_compression.cu similarity index 94% rename from src/kvstore/gc.cu rename to src/kvstore/gradient_compression.cu index fee944db33bf..b0d9662520b2 100644 --- a/src/kvstore/gc.cu +++ b/src/kvstore/gradient_compression.cu @@ -18,12 +18,12 @@ */ /*! - * \file gc.cu + * \file gradient_compression.cu * \author Rahul Huilgol * \brief Implementation for gpu version of code */ -#include "./gc-inl.h" +#include "gradient_compression-inl.h" namespace mxnet { namespace kvstore { diff --git a/src/kvstore/gc.h b/src/kvstore/gradient_compression.h similarity index 96% rename from src/kvstore/gc.h rename to src/kvstore/gradient_compression.h index e0801d5b795f..804547a04ebe 100644 --- a/src/kvstore/gc.h +++ b/src/kvstore/gradient_compression.h @@ -18,13 +18,13 @@ */ /*! 
- * \file gc.h + * \file gradient_compression.h * \brief Gradient compression for kvstore * \author Rahul Huilgol */ -#ifndef MXNET_KVSTORE_GC_H_ -#define MXNET_KVSTORE_GC_H_ +#ifndef MXNET_KVSTORE_GRADIENT_COMPRESSION_H_ +#define MXNET_KVSTORE_GRADIENT_COMPRESSION_H_ #include #include"mxnet/ndarray.h" @@ -135,4 +135,4 @@ class Gc { }; } // namespace kvstore } // namespace mxnet -#endif // MXNET_KVSTORE_GC_H_ +#endif // MXNET_KVSTORE_GRADIENT_COMPRESSION_H_ From 0743f60ad985e32ce20a89d7e22f9b1250989e10 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 15 Nov 2017 22:51:32 -0800 Subject: [PATCH 222/237] indents, documentation and test update --- src/kvstore/comm.h | 2 +- src/kvstore/kvstore_dist.h | 10 +-- src/kvstore/kvstore_dist_server.h | 4 +- tests/nightly/dist_sync_kvstore.py | 69 +++++------------- tests/nightly/test_kvstore.py | 109 ++++++++++++++++------------- 5 files changed, 87 insertions(+), 107 deletions(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 815c6bc0ff8e..248c04643e35 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -540,7 +540,7 @@ class CommDevice : public Comm { } const NDArray& ReduceCompressed(int key, const std::vector& src, - int priority) { + int priority) { InitBuffersAndComm(src); auto& buf = merge_buf_[key]; diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 20ee339f7e24..fc9a0d86a7bc 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -156,10 +156,12 @@ class KVStoreDist : public KVStoreLocal { /** * \brief cache all key partitions * - * `ps_kv_` is used for row sparse - * - * `push_ps_kv_` and `pull_ps_kv_`, used for default type gradients, are same - * when there is no gradient compression + * `ps_kv_` is used for pushes and pulls without gradient compression + * `compr_ps_kv_` is used for gradient compression. It contains different + * pskv for push and pull because sizes would be different in both cases. 
+ * Note: `ps_kv_[k]` for some key k may not be the same as `compr_ps_kv_[k].pull` + * This is because sharding may cause slightly different divisions when size is + * not perfectly divisible. */ std::unordered_map ps_kv_; std::unordered_map compr_ps_kv_; diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index c40fb585329b..d7fc6e8ce0ae 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -381,8 +381,8 @@ class KVStoreDistServer { } void DataHandleCompressed(const ps::KVMeta& req_meta, - const ps::KVPairs &req_data, - ps::KVServer* server) { + const ps::KVPairs &req_data, + ps::KVServer* server) { CHECK_EQ(req_meta.cmd, kCompressedPushPull); if (req_meta.push) { // there used several WaitToRead, this is because \a recved's memory diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index cbb1c3a51806..5a9bfc12417d 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -24,6 +24,7 @@ import numpy as np import numpy.random as rnd from mxnet.test_utils import assert_almost_equal +from test_kvstore import compute_expected_2bit_quantization def check_diff_to_scalar(A, x, rank=None): """ assert A == x""" @@ -65,6 +66,8 @@ def init_kv_compressed(kv): kv.init('11221', mx.nd.zeros(big_shape)) kv.init('112221', mx.nd.zeros(irregular_shape)) kv.init('1121', mx.nd.zeros(shape)) + # to test inactive mode + kv.init('1122', mx.nd.ones(shape)) return kv, threshold def test_sync_push_pull(): @@ -183,51 +186,6 @@ def check_big_row_sparse_keys(kv, my_rank, nworker): expected[row] = updated_val[row] check_diff_to_scalar(val, expected, rank=my_rank) - def compute_expected(arr, curr_residual, threshold): - from struct import pack,unpack - def bits2int(bits): - bits = [int(x) for x in bits[::-1]] - x = 0 - for i in range(len(bits)): - x += bits[i]*2**i - return x - - def as_float32(s): - return unpack("f",pack("I", bits2int(s)))[0] - - # str_quant 
stores the quantized representation as a sequence of bits - str_quant = '' - new_residual = [] - decompr = [] - arr_npy = arr.asnumpy() - curr_res_npy = curr_residual.asnumpy() - for i, a in np.ndenumerate(arr_npy): - a += curr_res_npy[i] - if a >= threshold: - str_quant += '11' - new_residual.append(a - threshold) - decompr.append(threshold) - elif a <= (-1*threshold): - str_quant += '10' - new_residual.append(a + threshold) - decompr.append(-1*threshold) - else: - str_quant += '00' - new_residual.append(a) - decompr.append(0) - # append extra bits when size of array not a factor of 16 - if len(str_quant)%16 != 0: - str_quant += '0'*(16 - len(str_quant)%16) - - compr = [] - # converts the string generated into integers 32chars at a time - i = 0 - while i= threshold: + str_quant += '11' + new_residual.append(a - threshold) + decompr.append(threshold) + elif a <= (-1*threshold): + str_quant += '10' + new_residual.append(a + threshold) + decompr.append(-1*threshold) + else: + str_quant += '00' + new_residual.append(a) + decompr.append(0) + # append extra bits when size of array not a factor of 16 + if len(str_quant)%16 != 0: + str_quant += '0'*(16 - len(str_quant)%16) + + compr = [] + # converts the string generated into integers 32chars at a time + i = 0 + while i= threshold: - str_quant += '11' - new_residual.append(a - threshold) - decompr.append(threshold) - elif a <= (-1*threshold): - str_quant += '10' - new_residual.append(a + threshold) - decompr.append(-1*threshold) - else: - str_quant += '00' - new_residual.append(a) - decompr.append(0) - # append extra bits when size of array not a factor of 16 - if len(str_quant)%16 != 0: - str_quant += '0'*(16 - len(str_quant)%16) - - compr = [] - # converts the string generated into integers 32chars at a time - i = 0 - while i Date: Wed, 15 Nov 2017 23:16:28 -0800 Subject: [PATCH 223/237] lint --- src/kvstore/comm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 
248c04643e35..18eca602cf7e 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -541,7 +541,6 @@ class CommDevice : public Comm { const NDArray& ReduceCompressed(int key, const std::vector& src, int priority) { - InitBuffersAndComm(src); auto& buf = merge_buf_[key]; std::vector reduce(src.size()); From d7aea02e00fede5ea652a4aeb0bee9654d6956e4 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 16 Nov 2017 00:00:45 -0800 Subject: [PATCH 224/237] pylint --- python/mxnet/kvstore.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index bb6358660443..cadd2b099911 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -406,7 +406,7 @@ def set_gradient_compression(self, compression_params=(('compression', '2bit'),) raise ValueError('compression_params requires `compression` to be set') elif not isinstance(compression_params['compression'], string_types): raise TypeError('compression must be a string') - elif compression_params['compression'] not in ['none','2bit']: + elif compression_params['compression'] not in ['none', '2bit']: raise ValueError('Unsupported type of compression') if compression_params['compression'] == '2bit': @@ -418,9 +418,9 @@ def set_gradient_compression(self, compression_params=(('compression', '2bit'),) else: compression_params['threshold'] = 0.5 - check_call(_LIB.MXKVStoreSetGradientCompression(self.handle, - c_str(compression_params['compression']), - mx_float(compression_params['threshold']))) + check_call(_LIB.MXKVStoreSetGradientCompression( + self.handle, c_str(compression_params['compression']), + mx_float(compression_params['threshold']))) def set_optimizer(self, optimizer): """ Registers an optimizer with the kvstore. 
From 40f71f85461272eea1a0301fa6ee0a4097f309af Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 16 Nov 2017 01:30:14 -0800 Subject: [PATCH 225/237] rename class, fix local kvstore test, remove confusing active method --- include/mxnet/kvstore.h | 2 +- src/kvstore/comm.h | 9 ++-- src/kvstore/gradient_compression.cc | 30 ++++------- src/kvstore/gradient_compression.h | 25 ++------- src/kvstore/kvstore_dist.h | 35 ++++++------- src/kvstore/kvstore_dist_server.h | 16 +++--- src/kvstore/kvstore_local.h | 7 ++- tests/nightly/dist_sync_kvstore.py | 2 +- tests/nightly/test_kvstore.py | 78 ++++++++++++++++------------- 9 files changed, 89 insertions(+), 115 deletions(-) diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index d6aa670a0db9..a3a4835067e9 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -400,7 +400,7 @@ class KVStore { * Used if SetGradientCompression sets the type. * Currently there is no support for un-setting gradient compression */ - std::shared_ptr gc_; + std::shared_ptr gradient_compression_; /** diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 18eca602cf7e..98f499359405 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -84,14 +84,16 @@ class Comm { * \brief Sets gradient compression parameters to be able to * perform reduce with compressed gradients */ - void SetGradientCompression(std::shared_ptr gc) { + void SetGradientCompression(std::shared_ptr gc) { gc_ = gc; + gc_set_ = true; } protected: Context pinned_ctx_; - std::shared_ptr gc_; + std::shared_ptr gc_; + bool gc_set_ = false; }; /** @@ -502,8 +504,7 @@ class CommDevice : public Comm { const NDArray& Reduce(int key, const std::vector& src, int priority) override { - if (gc_->get_type() != GC_NONE) { - CHECK(gc_->is_active()); + if (gc_set_) { return ReduceCompressed(key, src, priority); } diff --git a/src/kvstore/gradient_compression.cc b/src/kvstore/gradient_compression.cc index 43926b54cce0..fe6d39d942f3 100644 --- 
a/src/kvstore/gradient_compression.cc +++ b/src/kvstore/gradient_compression.cc @@ -48,36 +48,26 @@ void split(const std::string &s, const char delim, Out result) { } } -Gc::Gc() { +GradientCompression::GradientCompression() { type_ = GC_NONE; - active_ = false; } -void Gc::SetParams(const std::string &compression_type, const float threshold) { +void GradientCompression::SetParams(const std::string &compression_type, const float threshold) { if (compression_type == "2bit") { SetTwoBitCompression(threshold); } } -void Gc::set_active(bool active) { - active_ = active; -} - -// note that this can be active when type is none, it denotes init is done for now -bool Gc::is_active() { - return active_; -} - -CompressionType Gc::get_type() { +CompressionType GradientCompression::get_type() { return type_; } -void Gc::SetTwoBitCompression(const float threshold) { +void GradientCompression::SetTwoBitCompression(const float threshold) { type_ = GC_TWO_BIT; threshold_ = threshold; } -std::string Gc::EncodeParams() { +std::string GradientCompression::EncodeParams() { std::string rval = std::to_string(type_); if (type_ == GC_TWO_BIT) { rval += "," + std::to_string(threshold_); @@ -85,7 +75,7 @@ std::string Gc::EncodeParams() { return rval; } -void Gc::DecodeParams(const std::string &s) { +void GradientCompression::DecodeParams(const std::string &s) { std::vector elems; split(s, ',', std::back_inserter(elems)); type_ = static_cast(stoi(elems[0])); @@ -96,7 +86,7 @@ void Gc::DecodeParams(const std::string &s) { } } -int Gc::GetCompressionFactor() { +int GradientCompression::GetCompressionFactor() { if (type_ == GC_TWO_BIT) { return 16; } else { @@ -105,14 +95,14 @@ int Gc::GetCompressionFactor() { } } -int64_t Gc::GetCompressedSize(const int64_t original_size) { +int64_t GradientCompression::GetCompressedSize(const int64_t original_size) { const int bits = GetCompressionFactor(); return ((original_size % bits == 0) ? 
original_size / bits : original_size / bits + 1); } -void Gc::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, +void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, mxnet::NDArray *residual, const int priority) { CHECK(from.shape().ndim() != 0) << "source operand has zero dimension shape"; CHECK(to->shape().ndim() != 0) << "destination operand has zero dimension shape"; @@ -149,7 +139,7 @@ void Gc::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, } } -void Gc::Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority) { +void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority) { CHECK(from.shape().ndim() != 0) << "source operands has zero dimension shape"; CHECK(to->shape().ndim() != 0) << "destination operand has zero dimension shape"; const int a = from.ctx().dev_mask(); diff --git a/src/kvstore/gradient_compression.h b/src/kvstore/gradient_compression.h index 804547a04ebe..7116b0e1a386 100644 --- a/src/kvstore/gradient_compression.h +++ b/src/kvstore/gradient_compression.h @@ -35,11 +35,11 @@ enum CompressionType { GC_NONE, GC_TWO_BIT }; -class Gc { +class GradientCompression { public: - Gc(); + GradientCompression(); - virtual ~Gc() {} + virtual ~GradientCompression() {} /*! * \brief sets parameters for gradient compression @@ -48,18 +48,6 @@ class Gc { */ void SetParams(const std::string &compression_type, const float threshold); - /*! - * \brief sets gradient compression to given mode - * Active mode is when gradients are compressed - * Compression is in inactive mode during init of parameters - */ - void set_active(bool active); - - /*! - * \brief returns boolean whether or not gc is in active mode - */ - bool is_active(); - /*! * \brief returns type of compression if any */ @@ -119,13 +107,6 @@ class Gc { */ CompressionType type_; - /*! 
- * \brief denotes whether gradient compression is active - * Value starts with false because we don't want initialization of parameters to be compressed. - * That would lead to bad convergence results. Currently after initialization, gc becomes active. - */ - bool active_; - /*! * \brief denotes threshold used for quantization and dequantization * Must be a positive value. All positive gradients will be thresholded to `threshold_` and diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index fc9a0d86a7bc..a6efc6fc2df0 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -89,7 +89,7 @@ class KVStoreDist : public KVStoreLocal { KVStoreLocal::SetGradientCompression(compression_type, threshold); if (get_rank() == 0) { // only rank 0 because init happens by rank 0 only - SendCommandToServers(kSetGradientCompression, gc_->EncodeParams()); + SendCommandToServers(kSetGradientCompression, gradient_compression_->EncodeParams()); } } @@ -178,9 +178,6 @@ class KVStoreDist : public KVStoreLocal { comm_->Init(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } if (get_rank() == 0) { - // set inactive for inits - if (gc_->is_active()) gc_->set_active(false); - Push_(keys, values, 0, false); // wait until the push is finished for (const int key : keys) { @@ -227,7 +224,7 @@ class KVStoreDist : public KVStoreLocal { size_t size = recv_buf.shape().Size(); // even if inactive should use compressed_pskv for pull when type is not none - PSKV& pskv = (gc_->get_type() == GC_NONE) ? + PSKV& pskv = (gradient_compression_->get_type() == GC_NONE) ? EncodeDefaultKey(key, size, false) : EncodeCompressedKey(key, size, false); #if MKL_EXPERIMENTAL == 1 @@ -237,7 +234,7 @@ class KVStoreDist : public KVStoreLocal { // false means not to delete data when SArray is deleted auto vals = new ps::SArray(data, size, false); // issue pull - int cmd = (gc_->get_type() != GC_NONE) ? 
kCompressedPushPull : kDefaultPushPull; + int cmd = (gradient_compression_->get_type() != GC_NONE) ? kCompressedPushPull : kDefaultPushPull; CHECK_NOTNULL(ps_worker_)->ZPull( pskv.keys, vals, &pskv.lens, cmd, [vals, cb](){ delete vals; cb(); }); }; @@ -308,10 +305,6 @@ class KVStoreDist : public KVStoreLocal { std::vector > grouped_vals; GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals); - // set active for non init pushes - // do_merge is proxy for non-init push - if (do_merge && !gc_->is_active()) gc_->set_active(true); - for (size_t i = 0; i < uniq_keys.size(); ++i) { // merge over devices int key = uniq_keys[i]; @@ -339,22 +332,26 @@ class KVStoreDist : public KVStoreLocal { // push to servers if (storage_type == kDefaultStorage) { - if (gc_->get_type() == GC_NONE) { + if (gradient_compression_->get_type() == GC_NONE) { PSKV& pskv = EncodeDefaultKey(key, comm_buf.shape().Size(), true); PushDefault(key, comm_buf, pskv, priority); } else { - // returns push_pskv if active, else pull_pskv + // Note: gradient compression uses `do_merge` as proxy to + // detect whether the push is initialization of a key or not. 
+ // is_active is false when push is initialization of key + bool is_active = do_merge; + PSKV &pskv = EncodeCompressedKey(key, comm_buf.shape().Size(), is_active); + // Returns push_pskv if active, else pull_pskv // we want inactive gc to send uncompressed gradients, - // but sharded in the same way as active gc - PSKV &pskv = EncodeCompressedKey(key, comm_buf.shape().Size(), gc_->is_active()); - if (gc_->is_active()) { + // but sharded in the same way as later pushes would when gc becomes active + if (is_active) { PushCompressed(key, comm_buf, pskv, priority); } else { PushDefault(key, comm_buf, pskv, priority); } } } else if (storage_type == kRowSparseStorage) { - CHECK_EQ(gc_->get_type(), GC_NONE) + CHECK_EQ(gradient_compression_->get_type(), GC_NONE) << "Gradient compression for row sparse storage type is not supported"; PushRowSparse(key, comm_buf, priority); } else { @@ -375,7 +372,7 @@ class KVStoreDist : public KVStoreLocal { false, comm_buf.dtype()); res_buf = 0; } - gc_->Quantize(comm_buf, &small_buf, &res_buf, priority); + gradient_compression_->Quantize(comm_buf, &small_buf, &res_buf, priority); auto push_to_servers = [this, key, pskv, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) { size_t size = small_buf.shape().Size(); @@ -566,7 +563,7 @@ class KVStoreDist : public KVStoreLocal { CHECK_GT(num_servers, 0); // represents size of data to be sent - size_t compr_size = gc_->GetCompressedSize(original_size); + size_t compr_size = gradient_compression_->GetCompressedSize(original_size); mu_.lock(); PSKV& pskv = (is_push) ? 
compr_ps_kv_[key].push : compr_ps_kv_[key].pull; @@ -614,7 +611,7 @@ class KVStoreDist : public KVStoreLocal { part_compr = static_cast (round(static_cast(compr_size)/num_servers*(i+1))) - static_cast (round(static_cast(compr_size)/num_servers*(i))); - part_orig = part_compr * gc_->GetCompressionFactor(); + part_orig = part_compr * gradient_compression_->GetCompressionFactor(); } // meta info diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index d7fc6e8ce0ae..b8bd1df55ce9 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -118,13 +118,13 @@ class KVStoreDistServer { ps_server_->set_request_handle( std::bind(&KVStoreDistServer::DataHandleEx, this, _1, _2, _3)); sync_mode_ = false; - gc_ = new Gc(); + gradient_compression_ = new GradientCompression(); log_verbose_ = dmlc::GetEnv("MXNET_KVSTORE_DIST_ROW_SPARSE_VERBOSE", false); } ~KVStoreDistServer() { delete ps_server_; - delete gc_; + delete gradient_compression_; } void set_controller(const KVStore::Controller& controller) { @@ -156,7 +156,7 @@ class KVStoreDistServer { } else if (recved.head == kSyncMode) { sync_mode_ = true; } else if (recved.head == kSetGradientCompression) { - gc_->DecodeParams(recved.body); + gradient_compression_->DecodeParams(recved.body); } else { // let the main thread to execute ctrl, which is necessary for python exec_.Exec([this, recved]() { @@ -413,7 +413,7 @@ class KVStoreDistServer { if (stored.is_none()) { stored = NDArray(dshape, Context()); - gc_->Dequantize(recved, &stored, 0); + gradient_compression_->Dequantize(recved, &stored, 0); server->Response(req_meta); stored.WaitToRead(); } else if (sync_mode_) { @@ -423,16 +423,16 @@ class KVStoreDistServer { merged.array = NDArray(dshape, Context()); } if (merged.request.size() == 0) { - gc_->Dequantize(recved, &merged.array, 0); + gradient_compression_->Dequantize(recved, &merged.array, 0); } else { - gc_->Dequantize(recved, &decomp_buf, 0); + 
gradient_compression_->Dequantize(recved, &decomp_buf, 0); merged.array += decomp_buf; } merged.request.push_back(req_meta); ApplyUpdates(key, &merged, &stored, server); } else { // async push - gc_->Dequantize(recved, &decomp_buf, 0); + gradient_compression_->Dequantize(recved, &decomp_buf, 0); exec_.Exec([this, key, &decomp_buf, &stored]() { CHECK(updater_); updater_(key, decomp_buf, &stored); @@ -546,7 +546,7 @@ class KVStoreDistServer { * starts with none, used after SetGradientCompression sets the type * currently there is no support for unsetting gradient compression */ - Gc* gc_; + GradientCompression* gradient_compression_; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 2cd916050bd3..9eb3339dc24b 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -58,7 +58,7 @@ class KVStoreLocal : public KVStore { comm_ = new CommCPU(); } pinned_ctx_ = comm_->pinned_ctx(); - gc_ = std::make_shared(); + gradient_compression_ = std::make_shared(); } virtual ~KVStoreLocal() { @@ -137,7 +137,7 @@ class KVStoreLocal : public KVStore { } void SetGradientCompression(const std::string& compression_type, const float threshold) override { - gc_->SetParams(compression_type, threshold); + gradient_compression_->SetParams(compression_type, threshold); } private: @@ -149,8 +149,7 @@ class KVStoreLocal : public KVStore { local_[keys[i]] = values[i].Copy(pinned_ctx_); comm_->Init(keys[i], values[i].storage_type(), values[i].shape(), values[i].dtype()); } - comm_->SetGradientCompression(gc_); - gc_->set_active(true); + comm_->SetGradientCompression(gradient_compression_); } virtual void PushImpl(const std::vector& keys, diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 5a9bfc12417d..6431b42654c5 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -231,7 +231,7 @@ def check_compr_pull_before_push(kv): for k,s in [('1121', 
shape),('112221',irregular_shape), ('11221', big_shape), ('1122',shape)]: if k=='1122': - # tests inactive status of GC during init + # tests that GC is not used for init of a key val = mx.nd.zeros(s) kv.pull(k, val) check_diff_to_scalar(val, 1) diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index 2abdd5a50f8b..7bbf8b0152c9 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -21,6 +21,9 @@ sys.path.insert(0, "../../python/") import mxnet as mx import numpy as np +import numpy.random as rnd +import copy + from mxnet.test_utils import assert_almost_equal def check_diff_to_scalar(A, x, rank=None): @@ -76,7 +79,7 @@ def as_float32(s): # let the last shape exceed MXNET_KVSTORE_BIGARRAY_BOUND shapes = [(4, 4), (100, 100), (2000, 2000)] -gc_inactive_key = 9 +gc_init_test_key = 9 lr = .1 nworker = 4 @@ -115,15 +118,22 @@ def test_compress_kvstore(kv_type, compression='2bit', threshold=0.5): kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate)) for k, s in zip(keys, shapes): kv.init(k, mx.nd.zeros(s)) - kv.init(gc_inactive_key, mx.nd.ones(shapes[0])) - def pull_inactive(kv): - for i in range(nrepeat): - out = [mx.nd.zeros(shapes[0], mx.gpu(g)) for g in range(nworker)] - kv.pull(gc_inactive_key, out=out) - exp = np.ones_like(out[0].asnumpy()) - for o in out: - assert_almost_equal(o.asnumpy(), exp) + kv.init(gc_init_test_key, mx.nd.ones(shapes[0])) + + # use different keys for random tests so that + # we can track residual from start + random_keys = [13, 15, 17] + for k, s in zip(random_keys, shapes): + kv.init(k, mx.nd.zeros(s)) + + def pull_init_test(kv): + # checks that compression is not applied to init of key + out = [mx.nd.zeros(shapes[0], mx.gpu(g)) for g in range(nworker)] + kv.pull(gc_init_test_key, out=out) + exp = np.ones_like(out[0].asnumpy()) + for o in out: + assert_almost_equal(o.asnumpy(), exp) def pull_before_push(kv): for i in range(nrepeat): @@ -186,34 +196,30 @@ def check_neg(kv, neg, 
rate, curval): # residual would be 0 again def check_compr_random(kv, threshold): - # use new keys so we can track residual - random_keys = [13, 15, 17] + mx.random.seed(123) + rnd.seed(123) for k, s in zip(random_keys, shapes): - kv.init(k, mx.nd.zeros(s)) - for j in range(len(random_keys)): - curr_residual = np.zeros(shapes[j]) - for l in range(nrepeat): - orig_val = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] - kv.pull(keys[j], out=orig_val) - - grads = [mx.nd.random_uniform(-0.6, 0.6, shape=shapes[j], ctx=mx.gpu(g)) for g in range(nworker)] - kv.push(keys[j], grads) - - val = [mx.nd.zeros(shapes[j], mx.gpu(g)) for g in range(nworker)] - kv.pull(keys[j], out=val) - - diffs = [val[g] - orig_val[g] for g in range(nworker)] - # compute expected by using simulation of operator - # on cpu - sum_dequantized_vals = np.zeros(shapes[j]) - for g in range(nworker): - compr, curr_residual, decompr = compute_expected_2bit_quantization( - grads[g], curr_residual, threshold) - sum_dequantized_vals += decompr*rate - - for g in range(nworker): - assert_almost_equal(diffs[g].asnumpy(), sum_dequantized_vals) - pull_inactive(kv) + curr_residual = [np.zeros(s) for g in range(nworker)] + orig_val = [mx.nd.zeros(s, mx.gpu(g)) for g in range(nworker)] + kv.pull(k, out=orig_val) + grads = [mx.nd.random_uniform(-0.6, 0.6, shape=s, ctx=mx.gpu(g)) for g in range(nworker)] + grads_cpy = copy.deepcopy(grads) + kv.push(k, grads) + val = [mx.nd.zeros(s, mx.gpu(g)) for g in range(nworker)] + kv.pull(k, out=val) + diffs = [val[g] - orig_val[g] for g in range(nworker)] + # compute expected by using simulation of operator + # on cpu + sum_dequantized_vals = np.zeros(s) + for g in range(nworker): + compr, curr_residual[g], decompr = compute_expected_2bit_quantization( + grads_cpy[g], curr_residual[g], threshold) + sum_dequantized_vals += (decompr * rate) + + for g in range(nworker): + assert_almost_equal(diffs[g].asnumpy(), sum_dequantized_vals) + + pull_init_test(kv) 
pull_before_push(kv) push_zeros(kv) curval = verify_residual(kv, threshold, rate) From eabc50303f730d8fdb61f84ed8641ebca5063266 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 16 Nov 2017 07:59:23 -0800 Subject: [PATCH 226/237] fix importing of compute expected in test_kvstore --- tests/nightly/test_kvstore.py | 52 +++++++++++++++++------------------ 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index 7bbf8b0152c9..95ea16ffb4b1 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -75,19 +75,6 @@ def as_float32(s): i+=32 return np.array(compr), np.array(new_residual).reshape(arr.shape), np.array(decompr).reshape(arr.shape) -keys = [3, 5, 7] -# let the last shape exceed MXNET_KVSTORE_BIGARRAY_BOUND -shapes = [(4, 4), (100, 100), (2000, 2000)] - -gc_init_test_key = 9 - -lr = .1 -nworker = 4 -nrepeat = 10 - -## generate data -data = [[[np.random.random(s)*2-1 for i in range(nworker)] for s in shapes] for j in range(nrepeat)] - ## individual key interface def test_kvstore(kv_type): print(kv_type) @@ -118,9 +105,8 @@ def test_compress_kvstore(kv_type, compression='2bit', threshold=0.5): kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate)) for k, s in zip(keys, shapes): kv.init(k, mx.nd.zeros(s)) - + # init one key with 1s so we can check if it was compressed during init kv.init(gc_init_test_key, mx.nd.ones(shapes[0])) - # use different keys for random tests so that # we can track residual from start random_keys = [13, 15, 17] @@ -196,8 +182,6 @@ def check_neg(kv, neg, rate, curval): # residual would be 0 again def check_compr_random(kv, threshold): - mx.random.seed(123) - rnd.seed(123) for k, s in zip(random_keys, shapes): curr_residual = [np.zeros(s) for g in range(nworker)] orig_val = [mx.nd.zeros(s, mx.gpu(g)) for g in range(nworker)] @@ -226,13 +210,6 @@ def check_compr_random(kv, threshold): check_neg(kv, -1*threshold, rate, curval) 
check_compr_random(kv, threshold) -test_kvstore('local_update_cpu') -test_kvstore('local_allreduce_cpu') -test_kvstore('local_allreduce_device') - -# compression for local kvstore happens only when reduce is on device -test_compress_kvstore('local_allreduce_device') - ## group keys interface def test_group_kvstore(kv_type): print(kv_type) @@ -253,6 +230,27 @@ def test_group_kvstore(kv_type): err = sum(err) / np.sum(np.abs(a)) assert(err < 1e-6), (err, a.shape) -test_group_kvstore('local_update_cpu') -test_group_kvstore('local_allreduce_cpu') -test_group_kvstore('local_allreduce_device') +if __name__ == "__main__": + keys = [3, 5, 7] + # let the last shape exceed MXNET_KVSTORE_BIGARRAY_BOUND + shapes = [(4, 4), (100, 100), (2000, 2000)] + + gc_init_test_key = 9 + + lr = .1 + nworker = 4 + nrepeat = 10 + + ## generate data + data = [[[np.random.random(s)*2-1 for i in range(nworker)] for s in shapes] for j in range(nrepeat)] + + test_kvstore('local_update_cpu') + test_kvstore('local_allreduce_cpu') + test_kvstore('local_allreduce_device') + + # compression for local kvstore happens only when reduce is on device + test_compress_kvstore('local_allreduce_device') + + test_group_kvstore('local_update_cpu') + test_group_kvstore('local_allreduce_cpu') + test_group_kvstore('local_allreduce_device') From 806586f41e8a2d7f3c22b92b5b382ea6b4a594b6 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 16 Nov 2017 08:04:05 -0800 Subject: [PATCH 227/237] fix bug in device kvstore --- src/kvstore/comm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index 98f499359405..e3e04599be55 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -504,7 +504,9 @@ class CommDevice : public Comm { const NDArray& Reduce(int key, const std::vector& src, int priority) override { - if (gc_set_) { + // when this reduce is called from kvstore_dist, gc is not set + // we don't do compression twice in dist_sync_device + if (gc_set_ && 
gc_->get_type() != GC_NONE) { return ReduceCompressed(key, src, priority); } From 6070450bad21bce80e5e34e6187f88a15f1601c8 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 16 Nov 2017 08:21:06 -0800 Subject: [PATCH 228/237] remove active comment in pull --- src/kvstore/kvstore_dist.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index a6efc6fc2df0..9ebd65dbae3a 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -223,7 +223,6 @@ class KVStoreDist : public KVStoreLocal { // convert to ps keys size_t size = recv_buf.shape().Size(); - // even if inactive should use compressed_pskv for pull when type is not none PSKV& pskv = (gradient_compression_->get_type() == GC_NONE) ? EncodeDefaultKey(key, size, false) : EncodeCompressedKey(key, size, false); From 2289129e13616032843ec4d4131bf81dc8ca5f83 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 16 Nov 2017 08:59:20 -0800 Subject: [PATCH 229/237] docstring --- python/mxnet/kvstore.py | 75 ++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index cadd2b099911..ecdf53028f3f 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -350,54 +350,53 @@ def row_sparse_pull(self, key, out=None, priority=0, row_ids=None): self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority))) def set_gradient_compression(self, compression_params=(('compression', '2bit'),)): - """ Specifies type of low-bit quantization for gradient compression if any, + """ Specifies type of low-bit quantization for gradient compression if any, \ and additional arguments depending on the type of compression being used. + 2bit Gradient Compression takes a positive float `threshold`. + The technique works by thresholding values such that positive values in the + gradient above threshold will be set to threshold. 
Negative values whose absolute + values are higher than threshold, will be set to the negative of threshold. + Values whose absolute values are less than threshold will be set to 0. + By doing so, each value in the gradient is in one of three states. 2bits are + used to represent these states, and every 16 float values in the original + gradient can be represented using one float. This compressed representation + can reduce communication costs. The difference between these thresholded values and + original values is stored at the sender's end as residual and added to the + gradient in the next iteration. + + When kvstore is 'local', gradient compression is used to reduce communication + between multiple devices (gpus). Gradient is quantized on each GPU which + computed the gradients, then sent to the GPU which merges the gradients. This + receiving GPU dequantizes the gradients and merges them. Note that this + increases memory usage on each GPU because of the residual array stored. + + When kvstore is 'dist', gradient compression is used to reduce communication + from worker to sender. Gradient is quantized on each worker which + computed the gradients, then sent to the server which dequantizes + this data and merges the gradients from each worker. Note that this + increases CPU memory usage on each worker because of the residual array stored. + Only worker to server communication is compressed in this setting. + If each machine has multiple GPUs, currently this GPU to GPU communication is + not compressed. Server to worker communication (in the case of pull) is also not + compressed. + + To use 2bit compression, we need to specify `compression` as `2bit`. + Only specifying `compression` would use default value for the threshold. 
+ To completely specify the arguments for 2bit compression, we would need to pass + a dictionary which includes `threshold` like: + {'compression': '2bit', 'threshold': 0.5} + Parameters ---------- compression_params : dict - `compression_params` is a dictionary specifying the type and parameters - for gradient compression. The key `compression` in this dictionary is a + A dictionary specifying the type and parameters for gradient compression. + The key `compression` in this dictionary is a required string argument and specifies the type of gradient compression. Other keys in this dictionary are optional and specific to the type of gradient compression. Defaults to (('compression', '2bit'),). The default value is not a dict, just to avoid pylint warning on dangerous default values. - - 2bit Gradient Compression takes a positive float `threshold`. - The technique works by thresholding values such that positive values in the - gradient above threshold will be set to threshold. Negative values whose absolute - values are higher than threshold, will be set to the negative of threshold. - Values whose absolute values are less than threshold will be set to 0. - By doing so, each value in the gradient is in one of three states. 2bits are - used to represent these states, and every 16 float values in the original - gradient can be represented using one float. This compressed representation - can reduce communication costs. The difference between these thresholded values and - original values is stored at the sender's end as residual and added to the - gradient in the next iteration. - - When kvstore is 'local', gradient compression is used to reduce communication - between multiple devices (gpus). Gradient is quantized on each GPU which - computed the gradients, then sent to the GPU which merges the gradients. This - receiving GPU dequantizes the gradients and merges them. Note that this - increases memory usage on each GPU because of the residual array stored. 
- - When kvstore is 'dist', gradient compression is used to reduce communication - from worker to sender. Gradient is quantized on each worker which - computed the gradients, then sent to the server which dequantizes - this data and merges the gradients from each worker. Note that this - increases CPU memory usage on each worker because of the residual array stored. - Only worker to server communication is compressed in this setting. - If each machine has multiple GPUs, currently this GPU to GPU communication is - not compressed. Server to worker communication (in the case of pull) is also not - compressed. - - To use 2bit compression, we need to specify `compression` as `2bit`. - Only specifying `compression` would use default value for the threshold. - To completely specify the arguments for 2bit compression, we would need to pass - a dictionary which includes `threshold` like: - {'compression': '2bit', 'threshold': 0.5} - """ if compression_params: if not isinstance(compression_params, dict): From f41e102ed1bff78e6762930925180407d600d51d Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 16 Nov 2017 14:12:54 -0800 Subject: [PATCH 230/237] use dmlc params, enums, Signed-off-by: Rahul --- example/image-classification/common/fit.py | 2 +- example/rnn/lstm_bucketing.py | 7 +-- include/mxnet/c_api.h | 9 ++-- include/mxnet/kvstore.h | 4 +- python/mxnet/gluon/trainer.py | 3 +- python/mxnet/kvstore.py | 59 +++++++++------------- python/mxnet/module/module.py | 3 +- src/c_api/c_api.cc | 15 ++++-- src/kvstore/comm.h | 4 +- src/kvstore/gradient_compression.cc | 40 ++++++++++----- src/kvstore/gradient_compression.h | 34 ++++++++++--- src/kvstore/kvstore.cc | 2 +- src/kvstore/kvstore_dist.h | 35 +++++++------ src/kvstore/kvstore_dist_server.h | 33 ++++++------ src/kvstore/kvstore_local.h | 4 +- 15 files changed, 143 insertions(+), 111 deletions(-) diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 
c60772dd495b..2b002c770266 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -120,7 +120,7 @@ def fit(args, network, data_loader, **kwargs): # kvstore kv = mx.kvstore.create(args.kv_store) if args.gc_type != 'none': - kv.set_gradient_compression({'compression': args.gc_type, + kv.set_gradient_compression({'type': args.gc_type, 'threshold': args.gc_threshold}) # logging diff --git a/example/rnn/lstm_bucketing.py b/example/rnn/lstm_bucketing.py index 0f5791e0e0b8..0e7f064f0078 100644 --- a/example/rnn/lstm_bucketing.py +++ b/example/rnn/lstm_bucketing.py @@ -47,10 +47,6 @@ help='the batch size.') parser.add_argument('--disp-batches', type=int, default=50, help='show progress for every n batches') -parser.add_argument('--gc-type', type=str, default='none', - help='type of gradient compression') -parser.add_argument('--gc-threshold', type=float, default=0.5, - help='threshold for 2bit gradient compression') def tokenize_text(fname, vocab=None, invalid_label=-1, start_label=0): if not os.path.isfile(fname): @@ -114,8 +110,7 @@ def sym_gen(seq_len): model = mx.mod.BucketingModule( sym_gen = sym_gen, default_bucket_key = data_train.default_bucket_key, - context = contexts, - compression_params = {'compression': args.gc_type}) + context = contexts) model.fit( train_data = data_train, diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index cc821ca86221..d4dcf1d349bd 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1534,13 +1534,14 @@ MXNET_DLL int MXKVStoreCreate(const char *type, /*! 
* \brief Set parameters to use low-bit compressed gradients * \param handle handle to the kvstore - * \param compression type of compression - * \param threshold set the threshold for 2bit compression + * \param keys keys for compression parameters + * \param vals values for compression parameters * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXKVStoreSetGradientCompression(KVStoreHandle handle, - const char *compression, - const float threshold); + mx_uint num_params, + const char** keys, + const char** vals); /*! * \brief Delete a KVStore handle. diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index a3a4835067e9..dcb782537ef4 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -70,8 +70,7 @@ class KVStore { * \param compression_type type of compression * \param threshold threshold for 2bit compression */ - virtual void SetGradientCompression(const std::string& compression_type, - const float threshold) = 0; + virtual void SetGradientCompression(std::vector >& kwargs) = 0; /*! * \brief Initialize a list of key-value pair to the store. 
@@ -402,7 +401,6 @@ class KVStore { */ std::shared_ptr gradient_compression_; - /** * \brief whether to do barrier when finalize */ diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index e58e605a87fd..0c782430597c 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -110,7 +110,8 @@ def _init_kvstore(self): kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts), arg_arrays) if kvstore: - kvstore.set_gradient_compression(self._compression_params) + if self._compression_params: + kvstore.set_gradient_compression(self._compression_params) if 'dist' in kvstore.type: update_on_kvstore = False for i, param in enumerate(self._params): diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index ecdf53028f3f..528968a78f72 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -63,6 +63,16 @@ def _ctype_key_value(keys, vals): else c_array(ctypes.c_int, [keys] * len(vals)) return (c_keys, c_array(NDArrayHandle, [value.handle for value in vals]), use_str_keys) +def _ctype_dict(param_dict): + """ + Returns ctype arrays for keys and values(converted to strings) in a dictionary + """ + assert(isinstance(param_dict, dict)), \ + "unexpected type for param_dict: " + str(type(param_dict)) + c_keys = c_array(ctypes.c_char_p, [c_str(k) for k in param_dict.keys()]) + c_vals = c_array(ctypes.c_char_p, [c_str(str(v)) for v in param_dict.values()]) + return (c_keys, c_vals) + def _updater_wrapper(updater): """A wrapper for the user-defined handle.""" def updater_handle(key, lhs_handle, rhs_handle, _): @@ -349,8 +359,8 @@ def row_sparse_pull(self, key, out=None, priority=0, row_ids=None): check_call(_LIB.MXKVStorePullRowSparse( self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority))) - def set_gradient_compression(self, compression_params=(('compression', '2bit'),)): - """ Specifies type of low-bit quantization for gradient compression if any, \ + def 
set_gradient_compression(self, compression_params): + """ Specifies type of low-bit quantization for gradient compression \ and additional arguments depending on the type of compression being used. 2bit Gradient Compression takes a positive float `threshold`. @@ -377,49 +387,30 @@ def set_gradient_compression(self, compression_params=(('compression', '2bit'),) this data and merges the gradients from each worker. Note that this increases CPU memory usage on each worker because of the residual array stored. Only worker to server communication is compressed in this setting. - If each machine has multiple GPUs, currently this GPU to GPU communication is - not compressed. Server to worker communication (in the case of pull) is also not - compressed. + If each machine has multiple GPUs, currently this GPU to GPU or GPU to CPU communication + is not compressed. Server to worker communication (in the case of pull) + is also not compressed. - To use 2bit compression, we need to specify `compression` as `2bit`. - Only specifying `compression` would use default value for the threshold. + To use 2bit compression, we need to specify `type` as `2bit`. + Only specifying `type` would use default value for the threshold. To completely specify the arguments for 2bit compression, we would need to pass a dictionary which includes `threshold` like: - {'compression': '2bit', 'threshold': 0.5} + {'type': '2bit', 'threshold': 0.5} Parameters ---------- compression_params : dict A dictionary specifying the type and parameters for gradient compression. - The key `compression` in this dictionary is a + The key `type` in this dictionary is a required string argument and specifies the type of gradient compression. + Currently `type` can be only `2bit` Other keys in this dictionary are optional and specific to the type - of gradient compression. Defaults to (('compression', '2bit'),). - The default value is not a dict, - just to avoid pylint warning on dangerous default values. 
+ of gradient compression. """ - if compression_params: - if not isinstance(compression_params, dict): - raise ValueError("compression_params needs to be a dictionary") - if 'compression' not in compression_params: - raise ValueError('compression_params requires `compression` to be set') - elif not isinstance(compression_params['compression'], string_types): - raise TypeError('compression must be a string') - elif compression_params['compression'] not in ['none', '2bit']: - raise ValueError('Unsupported type of compression') - - if compression_params['compression'] == '2bit': - if 'threshold' in compression_params: - if not isinstance(compression_params['threshold'], numeric_types): - raise TypeError('threshold must be a numeric type') - if compression_params['threshold'] <= 0: - raise ValueError('threshold must be greater than 0') - else: - compression_params['threshold'] = 0.5 - - check_call(_LIB.MXKVStoreSetGradientCompression( - self.handle, c_str(compression_params['compression']), - mx_float(compression_params['threshold']))) + ckeys, cvals = _ctype_dict(compression_params) + check_call(_LIB.MXKVStoreSetGradientCompression(self.handle, + mx_uint(len(compression_params)), + ckeys, cvals)) def set_optimizer(self, optimizer): """ Registers an optimizer with the kvstore. 
diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index 09f8a830eee6..90c3abe3b566 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -527,7 +527,8 @@ def init_optimizer(self, kvstore='local', optimizer='sgd', self._updater = None if kvstore: - kvstore.set_gradient_compression(self._compression_params) + if self._compression_params: + kvstore.set_gradient_compression(self._compression_params) # copy initialized local parameters to kvstore _initialize_kvstore(kvstore=kvstore, param_arrays=self._exec_group.param_arrays, diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 94525eb9fe8b..13c92f431559 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -733,10 +733,17 @@ int MXKVStoreCreate(const char *type, API_END(); } -int MXKVStoreSetGradientCompression(KVStoreHandle handle, - const char *compression, const float threshold) { - API_BEGIN(); - static_cast(handle)->SetGradientCompression(compression, threshold); +int MXKVStoreSetGradientCompression(KVStoreHandle handle, mx_uint num_params, + const char** keys, const char** vals) { + API_BEGIN(); + std::vector > params; + for(mx_uint i = 0; i < num_params; ++i) { + std::pair p; + p.first = keys[i]; + p.second = vals[i]; + params.push_back(p); + } + static_cast(handle)->SetGradientCompression(params); API_END(); } diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h index e3e04599be55..94f0d06c2047 100644 --- a/src/kvstore/comm.h +++ b/src/kvstore/comm.h @@ -86,14 +86,12 @@ class Comm { */ void SetGradientCompression(std::shared_ptr gc) { gc_ = gc; - gc_set_ = true; } protected: Context pinned_ctx_; std::shared_ptr gc_; - bool gc_set_ = false; }; /** @@ -506,7 +504,7 @@ class CommDevice : public Comm { int priority) override { // when this reduce is called from kvstore_dist, gc is not set // we don't do compression twice in dist_sync_device - if (gc_set_ && gc_->get_type() != GC_NONE) { + if ((gc_ != nullptr) && (gc_->get_type() != 
CompressionType::kNone)) { return ReduceCompressed(key, src, priority); } diff --git a/src/kvstore/gradient_compression.cc b/src/kvstore/gradient_compression.cc index fe6d39d942f3..5919ee49761e 100644 --- a/src/kvstore/gradient_compression.cc +++ b/src/kvstore/gradient_compression.cc @@ -48,13 +48,20 @@ void split(const std::string &s, const char delim, Out result) { } } +DMLC_REGISTER_PARAMETER(GradientCompressionParam); + GradientCompression::GradientCompression() { - type_ = GC_NONE; + type_ = CompressionType::kNone; } -void GradientCompression::SetParams(const std::string &compression_type, const float threshold) { - if (compression_type == "2bit") { - SetTwoBitCompression(threshold); +void GradientCompression::SetParams(std::vector >& kwargs) { + GradientCompressionParam params; + params.InitAllowUnknown(kwargs); + CHECK_GT(params.threshold, 0) << "threshold must be greater than 0"; + if (params.type == "2bit") { + SetTwoBitCompression(params.threshold); + } else { + LOG(FATAL) << "Unknown type for gradient compression " << params.type; } } @@ -62,15 +69,20 @@ CompressionType GradientCompression::get_type() { return type_; } +std::string GradientCompression::get_type_str() { + return std::to_string(static_cast(type_)); +} + void GradientCompression::SetTwoBitCompression(const float threshold) { - type_ = GC_TWO_BIT; + type_ = CompressionType::kTwoBit; threshold_ = threshold; } std::string GradientCompression::EncodeParams() { - std::string rval = std::to_string(type_); - if (type_ == GC_TWO_BIT) { - rval += "," + std::to_string(threshold_); + using namespace std; // to reduce length of next line + string rval = get_type_str(); + if (type_ == CompressionType::kTwoBit) { + rval += "," + to_string(threshold_); } return rval; } @@ -87,10 +99,10 @@ void GradientCompression::DecodeParams(const std::string &s) { } int GradientCompression::GetCompressionFactor() { - if (type_ == GC_TWO_BIT) { + if (type_ == CompressionType::kTwoBit) { return 16; } else { - LOG(FATAL) 
<< "Unsupported compression type: " << type_; + LOG(FATAL) << "Unsupported compression type: " << get_type_str(); return 0; } } @@ -110,7 +122,7 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t const int a = from.ctx().dev_mask(); const int b = to->ctx().dev_mask(); const float threshold = threshold_; - if (type_ == GC_TWO_BIT) { + if (type_ == CompressionType::kTwoBit) { if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) { std::vector inputs = {from.data(), residual->data(), to->data()}; @@ -135,7 +147,7 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t #endif } } else { - LOG(FATAL) << "Unsupported quantization of type " << type_; + LOG(FATAL) << "Unsupported quantization of type " << get_type_str(); } } @@ -145,7 +157,7 @@ void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray const int a = from.ctx().dev_mask(); const int b = to->ctx().dev_mask(); const float threshold = threshold_; - if (type_ == GC_TWO_BIT) { + if (type_ == CompressionType::kTwoBit) { if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) { mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) { std::vector inputs = {from.data(), to->data()}; @@ -170,7 +182,7 @@ void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray #endif } } else { - LOG(FATAL) << "Unsupported dequantization of type " << type_; + LOG(FATAL) << "Unsupported dequantization of type " << get_type_str(); } } diff --git a/src/kvstore/gradient_compression.h b/src/kvstore/gradient_compression.h index 7116b0e1a386..895ebc6c7211 100644 --- a/src/kvstore/gradient_compression.h +++ b/src/kvstore/gradient_compression.h @@ -26,13 +26,25 @@ #ifndef MXNET_KVSTORE_GRADIENT_COMPRESSION_H_ #define MXNET_KVSTORE_GRADIENT_COMPRESSION_H_ #include -#include"mxnet/ndarray.h" +#include +#include 
namespace mxnet { namespace kvstore { -enum CompressionType { - GC_NONE, GC_TWO_BIT +enum class CompressionType { + kNone, kTwoBit +}; + +struct GradientCompressionParam : public dmlc::Parameter { + std::string type; + float threshold; + DMLC_DECLARE_PARAMETER(GradientCompressionParam) { + DMLC_DECLARE_FIELD(type) + .describe("Type of gradient compression to use, like `2bit` for example"); + DMLC_DECLARE_FIELD(threshold).set_default(0.5) + .describe("Threshold to use for 2bit gradient compression"); + } }; class GradientCompression { @@ -43,16 +55,21 @@ class GradientCompression { /*! * \brief sets parameters for gradient compression - * \param compression_type str representing types like 2bit - * \param threshold float value used for thresholding gradients + * \param kwargs a vector of pair of strings. A pair represents key and value + * of the parameter. Will be parsed by GradientCompressionParam */ - void SetParams(const std::string &compression_type, const float threshold); + void SetParams(std::vector >& kwargs); /*! * \brief returns type of compression if any */ CompressionType get_type(); + /*! + * \brief returns as string the enum value of compression type + */ + std::string get_type_str(); + /*! * \brief sets two bit gradient compression * \param threshold float value used for thresholding gradients @@ -113,6 +130,11 @@ class GradientCompression { * all negative gradients will be thresholded to -1*`threshold_` */ float threshold_ = 0; + + /*! 
+ * \brief parameters for gradient compression are sent in this form to backend + */ + GradientCompressionParam params; }; } // namespace kvstore } // namespace mxnet diff --git a/src/kvstore/kvstore.cc b/src/kvstore/kvstore.cc index a288676102cb..059961e1781f 100644 --- a/src/kvstore/kvstore.cc +++ b/src/kvstore/kvstore.cc @@ -48,7 +48,7 @@ KVStore* KVStore::Create(const char *type_name) { kv = new kvstore::KVStoreDist(use_device_comm); if (!has("_async") && kv->IsWorkerNode() && kv->get_rank() == 0) { // configure the server to be the sync mode - kv->SendCommandToServers(kvstore::kSyncMode, ""); + kv->SendCommandToServers(static_cast(kvstore::CommandType::kSyncMode), ""); } #else LOG(FATAL) << "compile with USE_DIST_KVSTORE=1 to use " << tname; diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 9ebd65dbae3a..960f68ca7102 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -68,7 +68,7 @@ class KVStoreDist : public KVStoreLocal { Barrier(); if (get_rank() == 0) { // stop the executor at servers - SendCommandToServers(kStopServer, ""); + SendCommandToServers(static_cast(CommandType::kStopServer), ""); } } ps::Finalize(barrier_before_exit_); @@ -85,11 +85,11 @@ class KVStoreDist : public KVStoreLocal { } } - void SetGradientCompression(const std::string& compression_type, const float threshold) override { - KVStoreLocal::SetGradientCompression(compression_type, threshold); + void SetGradientCompression(std::vector >& kwargs) override { + KVStoreLocal::SetGradientCompression(kwargs); if (get_rank() == 0) { - // only rank 0 because init happens by rank 0 only - SendCommandToServers(kSetGradientCompression, gradient_compression_->EncodeParams()); + SendCommandToServers(static_cast(CommandType::kSetGradientCompression), + gradient_compression_->EncodeParams()); } } @@ -223,7 +223,7 @@ class KVStoreDist : public KVStoreLocal { // convert to ps keys size_t size = recv_buf.shape().Size(); - PSKV& pskv = 
(gradient_compression_->get_type() == GC_NONE) ? + PSKV& pskv = (gradient_compression_->get_type() == CompressionType::kNone) ? EncodeDefaultKey(key, size, false) : EncodeCompressedKey(key, size, false); #if MKL_EXPERIMENTAL == 1 @@ -233,7 +233,9 @@ class KVStoreDist : public KVStoreLocal { // false means not to delete data when SArray is deleted auto vals = new ps::SArray(data, size, false); // issue pull - int cmd = (gradient_compression_->get_type() != GC_NONE) ? kCompressedPushPull : kDefaultPushPull; + int cmd = (gradient_compression_->get_type() != CompressionType::kNone) ? + static_cast(DataHandleType::kCompressedPushPull) : + static_cast(DataHandleType::kDefaultPushPull); CHECK_NOTNULL(ps_worker_)->ZPull( pskv.keys, vals, &pskv.lens, cmd, [vals, cb](){ delete vals; cb(); }); }; @@ -331,7 +333,7 @@ class KVStoreDist : public KVStoreLocal { // push to servers if (storage_type == kDefaultStorage) { - if (gradient_compression_->get_type() == GC_NONE) { + if (gradient_compression_->get_type() == CompressionType::kNone) { PSKV& pskv = EncodeDefaultKey(key, comm_buf.shape().Size(), true); PushDefault(key, comm_buf, pskv, priority); } else { @@ -350,7 +352,7 @@ class KVStoreDist : public KVStoreLocal { } } } else if (storage_type == kRowSparseStorage) { - CHECK_EQ(gradient_compression_->get_type(), GC_NONE) + CHECK(gradient_compression_->get_type() == CompressionType::kNone) << "Gradient compression for row sparse storage type is not supported"; PushRowSparse(key, comm_buf, priority); } else { @@ -382,7 +384,8 @@ class KVStoreDist : public KVStoreLocal { // do push. 
false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, kCompressedPushPull, [cb]() { cb(); }); + pskv.keys, vals, pskv.lens, + static_cast(DataHandleType::kCompressedPushPull), [cb]() { cb(); }); }; // acquire locks on both comm_buf and small_buf so that // pull (which uses comm_buf) for the same key waits till push finishes @@ -408,7 +411,8 @@ class KVStoreDist : public KVStoreLocal { // do push. false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( - pskv.keys, vals, pskv.lens, kDefaultPushPull, [cb]() { cb(); }); + pskv.keys, vals, pskv.lens, + static_cast(DataHandleType::kDefaultPushPull), [cb]() { cb(); }); }; Engine::Get()->PushAsync( push_to_servers, @@ -442,9 +446,9 @@ class KVStoreDist : public KVStoreLocal { << pskv.keys << " size: " << size; } ps::SArray vals(data, size, false); - CHECK_NOTNULL(ps_worker_)->ZPush(pskv.keys, vals, pskv.lens, kRowSparsePushPull, [cb]() { - cb(); - }); + CHECK_NOTNULL(ps_worker_)->ZPush(pskv.keys, vals, pskv.lens, + static_cast(DataHandleType::kRowSparsePushPull), + [cb]() { cb(); }); }; Engine::Get()->PushAsync( push_to_servers, @@ -486,7 +490,8 @@ class KVStoreDist : public KVStoreLocal { // at this point, later functions may access the indices variable while copy happens mshadow::Copy(recv_buf.aux_data(kIdx).FlatTo1D(), indices.data().FlatTo1D()); - CHECK_NOTNULL(ps_worker_)->ZPull(pskv.keys, vals, &pskv.lens, kRowSparsePushPull, + CHECK_NOTNULL(ps_worker_)->ZPull(pskv.keys, vals, &pskv.lens, + static_cast(DataHandleType::kRowSparsePushPull), [vals, cb]() { delete vals; cb(); }); }; CHECK_NOTNULL(Engine::Get())->PushAsync( diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index b8bd1df55ce9..97d80999bb5f 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -39,12 +39,13 @@ namespace mxnet { namespace kvstore { -static const int kRowSparsePushPull = 
1; -static const int kDefaultPushPull = 0; -static const int kCompressedPushPull = 3; -static const int kStopServer = -1; -static const int kSyncMode = -2; -static const int kSetGradientCompression = 2; +enum class CommandType { + kStopServer, kSyncMode, kSetGradientCompression +}; + +enum class DataHandleType { + kDefaultPushPull, kCompressedPushPull, kRowSparsePushPull +}; /** * \brief executor runs a function using the thread called \ref Start @@ -118,13 +119,12 @@ class KVStoreDistServer { ps_server_->set_request_handle( std::bind(&KVStoreDistServer::DataHandleEx, this, _1, _2, _3)); sync_mode_ = false; - gradient_compression_ = new GradientCompression(); + gradient_compression_ = std::make_shared(); log_verbose_ = dmlc::GetEnv("MXNET_KVSTORE_DIST_ROW_SPARSE_VERBOSE", false); } ~KVStoreDistServer() { delete ps_server_; - delete gradient_compression_; } void set_controller(const KVStore::Controller& controller) { @@ -151,11 +151,12 @@ class KVStoreDistServer { }; void CommandHandle(const ps::SimpleData& recved, ps::SimpleApp* app) { - if (recved.head == kStopServer) { + CommandType recved_type = static_cast(recved.head); + if (recved_type == CommandType::kStopServer) { exec_.Stop(); - } else if (recved.head == kSyncMode) { + } else if (recved_type == CommandType::kSyncMode) { sync_mode_ = true; - } else if (recved.head == kSetGradientCompression) { + } else if (recved_type == CommandType::kSetGradientCompression) { gradient_compression_->DecodeParams(recved.body); } else { // let the main thread to execute ctrl, which is necessary for python @@ -170,9 +171,10 @@ class KVStoreDistServer { void DataHandleEx(const ps::KVMeta& req_meta, const ps::KVPairs& req_data, ps::KVServer* server) { - if (req_meta.cmd == kRowSparsePushPull) { + DataHandleType recved_type = static_cast(req_meta.cmd); + if (recved_type == DataHandleType::kRowSparsePushPull) { DataHandleRowSparse(req_meta, req_data, server); - } else if (req_meta.cmd == kCompressedPushPull) { + } else if 
(recved_type == DataHandleType::kCompressedPushPull) { DataHandleCompressed(req_meta, req_data, server); } else { DataHandleDefault(req_meta, req_data, server); @@ -383,7 +385,6 @@ class KVStoreDistServer { void DataHandleCompressed(const ps::KVMeta& req_meta, const ps::KVPairs &req_data, ps::KVServer* server) { - CHECK_EQ(req_meta.cmd, kCompressedPushPull); if (req_meta.push) { // there used several WaitToRead, this is because \a recved's memory // could be deallocated when this function returns. so we need to make sure @@ -451,7 +452,7 @@ class KVStoreDistServer { void DataHandleDefault(const ps::KVMeta& req_meta, const ps::KVPairs &req_data, ps::KVServer* server) { - CHECK_EQ(req_meta.cmd, kDefaultPushPull); + CHECK_EQ(req_meta.cmd, static_cast(DataHandleType::kDefaultPushPull)); // do some check CHECK_EQ(req_data.keys.size(), (size_t)1); if (req_meta.push) { @@ -546,7 +547,7 @@ class KVStoreDistServer { * starts with none, used after SetGradientCompression sets the type * currently there is no support for unsetting gradient compression */ - GradientCompression* gradient_compression_; + std::shared_ptr gradient_compression_; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 9eb3339dc24b..d9545b4191da 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -136,8 +136,8 @@ class KVStoreLocal : public KVStore { PullRowSparseImpl(keys, val_rowids, priority); } - void SetGradientCompression(const std::string& compression_type, const float threshold) override { - gradient_compression_->SetParams(compression_type, threshold); + void SetGradientCompression(std::vector >& kwargs) override { + gradient_compression_->SetParams(kwargs); } private: From 5acbc9a8d78564151be3faccad00a6f7a38696e1 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 16 Nov 2017 14:23:14 -0800 Subject: [PATCH 231/237] doc updates Signed-off-by: Rahul --- python/mxnet/gluon/trainer.py | 2 +- python/mxnet/kvstore.py | 2 
+- python/mxnet/module/bucketing_module.py | 2 +- python/mxnet/module/module.py | 2 +- src/kvstore/gradient_compression-inl.h | 2 +- src/kvstore/gradient_compression.h | 5 ----- tools/bandwidth/measure.py | 3 ++- 7 files changed, 7 insertions(+), 11 deletions(-) diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 0c782430597c..f3a14609587f 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -47,7 +47,7 @@ class Trainer(object): compression_params : dict Specifies type of gradient compression and additional arguments depending on the type of compression being used. For example, 2bit compression requires a threshold. - Arguments would then be {'compression':'2bit', 'threshold':0.5} + Arguments would then be {'type':'2bit', 'threshold':0.5} See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. Properties diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 528968a78f72..55243e0aa11c 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -24,7 +24,7 @@ from .ndarray import NDArray from .ndarray import _ndarray_cls from .base import _LIB -from .base import check_call, c_array, c_str, string_types, numeric_types, mx_uint, mx_float, py_str +from .base import check_call, c_array, c_str, string_types, mx_uint, py_str from .base import NDArrayHandle, KVStoreHandle from . import optimizer as opt diff --git a/python/mxnet/module/bucketing_module.py b/python/mxnet/module/bucketing_module.py index 5ebc455f6a29..9b90eeb9427e 100644 --- a/python/mxnet/module/bucketing_module.py +++ b/python/mxnet/module/bucketing_module.py @@ -55,7 +55,7 @@ class BucketingModule(BaseModule): compression_params : dict Specifies type of gradient compression and additional arguments depending on the type of compression being used. For example, 2bit compression requires a threshold. 
- Arguments would then be {'compression':'2bit', 'threshold':0.5} + Arguments would then be {'type':'2bit', 'threshold':0.5} See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. """ def __init__(self, sym_gen, default_bucket_key=None, logger=logging, diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index 90c3abe3b566..dd59e09da057 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -62,7 +62,7 @@ class Module(BaseModule): compression_params : dict Specifies type of gradient compression and additional arguments depending on the type of compression being used. For example, 2bit compression requires a threshold. - Arguments would then be {'compression':'2bit', 'threshold':0.5} + Arguments would then be {'type':'2bit', 'threshold':0.5} See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. """ def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',), diff --git a/src/kvstore/gradient_compression-inl.h b/src/kvstore/gradient_compression-inl.h index 3e182232f47e..9b69bd11472c 100644 --- a/src/kvstore/gradient_compression-inl.h +++ b/src/kvstore/gradient_compression-inl.h @@ -31,7 +31,7 @@ namespace mxnet { namespace kvstore { -// these gpu functions are defined in gc.cu +// these gpu functions are defined in gradient_compression.cu void Quantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, const float threshold); void Dequantize2BitImpl(mshadow::Stream *s, const std::vector &inputs, diff --git a/src/kvstore/gradient_compression.h b/src/kvstore/gradient_compression.h index 895ebc6c7211..872e27f8d979 100644 --- a/src/kvstore/gradient_compression.h +++ b/src/kvstore/gradient_compression.h @@ -130,11 +130,6 @@ class GradientCompression { * all negative gradients will be thresholded to -1*`threshold_` */ float threshold_ = 0; - - /*! 
- * \brief parameters for gradient compression are sent in this form to backend - */ - GradientCompressionParam params; }; } // namespace kvstore } // namespace mxnet diff --git a/tools/bandwidth/measure.py b/tools/bandwidth/measure.py index cc8379dfc0b6..cd4f0fe8433c 100644 --- a/tools/bandwidth/measure.py +++ b/tools/bandwidth/measure.py @@ -78,7 +78,8 @@ def run(network, optimizer, gpus, kv_store, image_shape, disp_batches, # create kvstore and optimizer devs = [mx.gpu(int(i)) for i in gpus.split(',')] kv = mx.kv.create(kv_store) - kv.set_gradient_compression({'compression': gc_type}) + if gc_type != 'none': + kv.set_gradient_compression({'type': gc_type}) if optimizer is None or optimizer == 'None': opt = None else: From 3c1bacb5a32cd5175599026142f8ce684944892d Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 16 Nov 2017 14:45:40 -0800 Subject: [PATCH 232/237] lint Signed-off-by: Rahul --- include/mxnet/kvstore.h | 3 ++- src/c_api/c_api.cc | 2 +- src/kvstore/gradient_compression.cc | 8 +++++--- src/kvstore/gradient_compression.h | 8 +++++--- src/kvstore/kvstore_dist.h | 3 ++- src/kvstore/kvstore_local.h | 3 ++- 6 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index dcb782537ef4..6957876b6c42 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -70,7 +70,8 @@ class KVStore { * \param compression_type type of compression * \param threshold threshold for 2bit compression */ - virtual void SetGradientCompression(std::vector >& kwargs) = 0; + virtual void SetGradientCompression(const std::vector > + & kwargs) = 0; /*! * \brief Initialize a list of key-value pair to the store. 
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 13c92f431559..b0faa54d4409 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -737,7 +737,7 @@ int MXKVStoreSetGradientCompression(KVStoreHandle handle, mx_uint num_params, const char** keys, const char** vals) { API_BEGIN(); std::vector > params; - for(mx_uint i = 0; i < num_params; ++i) { + for (mx_uint i = 0; i < num_params; ++i) { std::pair p; p.first = keys[i]; p.second = vals[i]; diff --git a/src/kvstore/gradient_compression.cc b/src/kvstore/gradient_compression.cc index 5919ee49761e..b8c626cd53a8 100644 --- a/src/kvstore/gradient_compression.cc +++ b/src/kvstore/gradient_compression.cc @@ -54,7 +54,8 @@ GradientCompression::GradientCompression() { type_ = CompressionType::kNone; } -void GradientCompression::SetParams(std::vector >& kwargs) { +void GradientCompression::SetParams(const std::vector > + & kwargs) { GradientCompressionParam params; params.InitAllowUnknown(kwargs); CHECK_GT(params.threshold, 0) << "threshold must be greater than 0"; @@ -79,7 +80,7 @@ void GradientCompression::SetTwoBitCompression(const float threshold) { } std::string GradientCompression::EncodeParams() { - using namespace std; // to reduce length of next line + using namespace std; // to reduce length of next line string rval = get_type_str(); if (type_ == CompressionType::kTwoBit) { rval += "," + to_string(threshold_); @@ -151,7 +152,8 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t } } -void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority) { +void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, + const int priority) { CHECK(from.shape().ndim() != 0) << "source operands has zero dimension shape"; CHECK(to->shape().ndim() != 0) << "destination operand has zero dimension shape"; const int a = from.ctx().dev_mask(); diff --git a/src/kvstore/gradient_compression.h 
b/src/kvstore/gradient_compression.h index 872e27f8d979..f40b45f5a513 100644 --- a/src/kvstore/gradient_compression.h +++ b/src/kvstore/gradient_compression.h @@ -25,9 +25,11 @@ #ifndef MXNET_KVSTORE_GRADIENT_COMPRESSION_H_ #define MXNET_KVSTORE_GRADIENT_COMPRESSION_H_ -#include -#include #include +#include +#include +#include +#include "mxnet/ndarray.h" namespace mxnet { namespace kvstore { @@ -58,7 +60,7 @@ class GradientCompression { * \param kwargs a vector of pair of strings. A pair represents key and value * of the parameter. Will be parsed by GradientCompressionParam */ - void SetParams(std::vector >& kwargs); + void SetParams(const std::vector >& kwargs); /*! * \brief returns type of compression if any diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 960f68ca7102..002d30d7161c 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -85,7 +85,8 @@ class KVStoreDist : public KVStoreLocal { } } - void SetGradientCompression(std::vector >& kwargs) override { + void SetGradientCompression(const std::vector > + & kwargs) override { KVStoreLocal::SetGradientCompression(kwargs); if (get_rank() == 0) { SendCommandToServers(static_cast(CommandType::kSetGradientCompression), diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index d9545b4191da..49c260231452 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -136,7 +136,8 @@ class KVStoreLocal : public KVStore { PullRowSparseImpl(keys, val_rowids, priority); } - void SetGradientCompression(std::vector >& kwargs) override { + void SetGradientCompression(const std::vector > + & kwargs) override { gradient_compression_->SetParams(kwargs); } From dfe7a7dc27d0902af6c4b3891a3d581fb1821b25 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 16 Nov 2017 15:32:46 -0800 Subject: [PATCH 233/237] typo Signed-off-by: Rahul --- python/mxnet/module/bucketing_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/python/mxnet/module/bucketing_module.py b/python/mxnet/module/bucketing_module.py index 1c2cc04e944a..4a5330ea2c5a 100644 --- a/python/mxnet/module/bucketing_module.py +++ b/python/mxnet/module/bucketing_module.py @@ -330,7 +330,7 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, context=self._context, work_load_list=self._work_load_list, fixed_param_names=self._fixed_param_names, state_names=self._state_names, - group2ctxs=self._group2ctxs + group2ctxs=self._group2ctxs, compression_params=self._compression_params) module.bind(data_shapes, label_shapes, for_training, inputs_need_grad, force_rebind=False, shared_module=None, grad_req=grad_req) From 4b6f34a4a9e22fea109465a449e56e051bb49910 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 16 Nov 2017 15:37:16 -0800 Subject: [PATCH 234/237] rename field to type Signed-off-by: Rahul --- tests/nightly/dist_sync_kvstore.py | 2 +- tests/nightly/test_kvstore.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 6431b42654c5..5ed4b2ab765f 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -61,7 +61,7 @@ def init_kv(): def init_kv_compressed(kv): threshold = 0.5 - kv.set_gradient_compression({'compression': '2bit', 'threshold':threshold}) + kv.set_gradient_compression({'type': '2bit', 'threshold':threshold}) # init kv compression keys kv.init('11221', mx.nd.zeros(big_shape)) kv.init('112221', mx.nd.zeros(irregular_shape)) diff --git a/tests/nightly/test_kvstore.py b/tests/nightly/test_kvstore.py index 95ea16ffb4b1..a14feac7a3aa 100644 --- a/tests/nightly/test_kvstore.py +++ b/tests/nightly/test_kvstore.py @@ -101,7 +101,7 @@ def test_compress_kvstore(kv_type, compression='2bit', threshold=0.5): print(kv_type + ' with ' + compression + ' compression') rate = 2 kv = mx.kv.create(kv_type) - kv.set_gradient_compression({'compression':compression, 
'threshold':threshold}) + kv.set_gradient_compression({'type':compression, 'threshold':threshold}) kv.set_optimizer(mx.optimizer.create('test', rescale_grad=rate)) for k, s in zip(keys, shapes): kv.init(k, mx.nd.zeros(s)) From 30a197b40aefdf1098e1430dabccfd8101e65b48 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 16 Nov 2017 18:03:00 -0800 Subject: [PATCH 235/237] fix distributed kvstore stopping issue. frontend was sending command with id=stopServer in old enum Signed-off-by: Rahul --- src/kvstore/kvstore_dist_server.h | 3 ++- tests/nightly/dist_sync_kvstore.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 97d80999bb5f..0afb5ddee0d8 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -40,7 +40,7 @@ namespace mxnet { namespace kvstore { enum class CommandType { - kStopServer, kSyncMode, kSetGradientCompression + kController, kStopServer, kSyncMode, kSetGradientCompression }; enum class DataHandleType { @@ -159,6 +159,7 @@ class KVStoreDistServer { } else if (recved_type == CommandType::kSetGradientCompression) { gradient_compression_->DecodeParams(recved.body); } else { + // this uses value 0 for message id from frontend // let the main thread to execute ctrl, which is necessary for python exec_.Exec([this, recved]() { CHECK(controller_); diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py index 5ed4b2ab765f..df85fe586054 100644 --- a/tests/nightly/dist_sync_kvstore.py +++ b/tests/nightly/dist_sync_kvstore.py @@ -278,7 +278,7 @@ def check_compr_random(kv, threshold, nworker): decompr *= nworker * rate assert_almost_equal(diff.asnumpy(), decompr) - print ('worker '+str(my_rank)+' started') + print ('worker '+str(my_rank)+' started with non compression tests') check_default_keys(kv, my_rank, nworker) check_row_sparse_keys(kv, my_rank, nworker) check_row_sparse_keys_with_zeros(kv, my_rank, 
nworker) @@ -286,6 +286,7 @@ def check_compr_random(kv, threshold, nworker): print('worker ' + str(my_rank) + ' is done with non compression tests') # don't run non compressed keys after this as kvstore now is set to compressed + print ('worker '+str(my_rank)+' started with compression tests') kv, threshold = init_kv_compressed(kv) check_compr_pull_before_push(kv) check_compr_zero(kv) From 3073bf7ce5f1c0b63a32a85fc603c5e3e382cb6e Mon Sep 17 00:00:00 2001 From: Rahul Date: Fri, 17 Nov 2017 08:34:27 -0800 Subject: [PATCH 236/237] Trigger CI From d5e4b2ef4a6276ae8868b58418a8826ce9cae5b0 Mon Sep 17 00:00:00 2001 From: Rahul Date: Fri, 17 Nov 2017 10:47:40 -0800 Subject: [PATCH 237/237] trigger CI