This repository has been archived by the owner on Nov 17, 2023. It is now read-only.
2bit gradient compression #8662
Merged

Changes from 250 commits (262 commits in total):
407c01a update two bit compression (aksnzhy)
8cbb7f6 Update trainer.py (aksnzhy)
0dd1874 Update test_operator.py (aksnzhy)
bbd21e4 update two bit compression (aksnzhy)
72640e9 update two bit compression (aksnzhy)
aaafa84 update two bit compression (aksnzhy)
2d85430 update (aksnzhy)
5a99e6a update (aksnzhy)
861fca5 update two bit compression (aksnzhy)
54c6f06 update two bit compression (aksnzhy)
03e47a4 update two bit compression (aksnzhy)
fedd4b4 update two bit compression (aksnzhy)
13ff1bc update two bit compression (aksnzhy)
b75d7ca update two bit compression (aksnzhy)
b84b762 update two bit compression (aksnzhy)
260b606 update two bit compression (aksnzhy)
7d78e3a update two bit compression (aksnzhy)
1b550eb Merge branch 'master' into master (aksnzhy)
2a90dae update two bit compression (aksnzhy)
baba1d8 update two bit compression (aksnzhy)
aac5292 update two bit compression (aksnzhy)
b63673a Update comm.h (aksnzhy)
ce0f3b2 Merge branch 'master' into master (aksnzhy)
3d3ac92 add original size in comrpessed array (aksnzhy)
f797271 update comm.h (aksnzhy)
5807469 update distributed training (aksnzhy)
7dbce8b update distributed training (aksnzhy)
d1fdfc4 Merge branch 'master' into master (aksnzhy)
112b683 Update ndarray_function.cu (aksnzhy)
fe10b7a Update kvstore_dist.h (aksnzhy)
cea9199 Update kvstore_dist.h (aksnzhy)
0ad7acc update (aksnzhy)
e44f8fb update (aksnzhy)
09ceb54 update (aksnzhy)
09971bf fix bug (aksnzhy)
237dc9b fix (aksnzhy)
2ffcfeb add GC test (rahul003)
c91cca3 fix bug in push (aksnzhy)
261e244 merged changes to kvstore compress setting with test of distsync (rahul003)
54bb44b fix push and pull (aksnzhy)
6baa79e merge with fix for push/pull (rahul003)
91df1b3 fix (aksnzhy)
ec8bbc7 fix (aksnzhy)
39f2e44 uncompiled (rahul003)
fd42f8c kvstore dist changes. added cpp_package. changed strtof function calls (rahul003)
f743ab1 fix usage of keys in dict (rahul003)
e18331f fix push and pull (aksnzhy)
7ec80ed fix (aksnzhy)
e24bc9b working 2bit dist merged (rahul003)
3657869 fix_test (rahul003)
95e073e fix_test (rahul003)
d595fa5 fix_test (rahul003)
f6e2b92 add print statements
6cf214e more print statements and move send command to server (rahul003)
4b0e756 set compress handling (rahul003)
bc31c4c kvstore dist changes (rahul003)
8630dbe working kvstore push and pull. not sure if I commited that. from this… (rahul003)
3a8f709 cleanup test (rahul003)
3f9256e debug prints (rahul003)
d6be11f working kvstore dist. includes mutation of inputs and setting thresho… (rahul003)
75363bb merge changes (rahul003)
e34d263 fix operator (rahul003)
c0894b1 kvstore dist changes (rahul003)
381941e fix compress kvstore issues. non compress is broken (rahul003)
d5c37eb fix sparse push issue (rahul003)
c0dc329 fix read lock issue (rahul003)
38e94f5 optimizer is the only issue now? (rahul003)
3f27a14 fix all issues with gc dist (rahul003)
5888e31 fix read lock issue (rahul003)
dbcec87 pushing sharded data works (rahul003)
2d53722 works most times. sometimes val instead of 0 has parts of 1 or 1.5... (rahul003)
0bc1da3 fix read lock issue (rahul003)
d120e9a prev commit fixed seg fault issue on pull without push in a server (rahul003)
4d5315a add waittowrite to fix pull before push problems (rahul003)
13b2ce5 refactor quantizing for sharded data (rahul003)
648e0e9 redo break up of data across servers,clearer split (rahul003)
a9bdcdc refactor to use param for thresholds. (rahul003)
1fac41f Added many checks for 0 (rahul003)
1fdbdf0 cmake changes (rahul003)
5b4e405 merge master (rahul003)
c1a9add formatting issues for easier merge (rahul003)
24bc361 fix rate (rahul003)
3a7985a fix compilation errors after merge (rahul003)
953ca95 fix compile error and ndarray thresholds in dequantize (rahul003)
8c6ba4f fix compile error and ndarray thresholds in dequantize (rahul003)
96fa9b3 fix compile error (rahul003)
baae59d fix compile error, and add comments (rahul003)
2d5696e update operator comments (rahul003)
36e1b51 comment checks (rahul003)
f73e463 comment checks (rahul003)
647b2ef compile error (rahul003)
8fd1cde working on local kvstore compress test (rahul003)
a0c2a2a fix module api compressparams, and change quantize tblob to inside en… (rahul003)
52f47e5 2bit arg wrong kvstore (rahul003)
a334924 remove log (rahul003)
be8d01d fix gpu dequantize and tests (rahul003)
bb473a4 fix seg fault in quantize and test indent (rahul003)
c8cfae5 tests print more info (rahul003)
e2b405a assert almost equal (rahul003)
3ee9249 more debug stuff (rahul003)
15e4f9c intermediate test rewrite (rahul003)
39f3bac small change in pushing op to engineh (rahul003)
50fa0fa fix concurrency of quantization (rahul003)
6bb1411 wait on kernel (rahul003)
558e1b5 Merge branch 'compress_params' of https://github.com/rahul003/mxnet i… (rahul003)
69f9e11 updated tests and removed prints (rahul003)
48591f2 comment unnecessary stuff (rahul003)
4146690 fix test (rahul003)
71296f8 remove print (rahul003)
25cdda3 Update dist_sync_kvstore.py (rahul003)
3234aa4 remove slow kernel launch init (rahul003)
5e333bf Merge branch 'gc' of https://github.com/rahul003/mxnet into gc (rahul003)
72d28b6 cleanup (rahul003)
8357301 merge master (rahul003)
287e040 undo changes in submodule (rahul003)
9290c23 submodule reset (rahul003)
99154c9 remove files (rahul003)
52b6905 undo changes unrelated to project (rahul003)
b560d25 undo changes unrelated to project (rahul003)
60b1b69 Comments and cleanup. (rahul003)
e3153ce more cleanup and comments (rahul003)
eeb454b comments for tests (rahul003)
2f936ee lint changes and comments (rahul003)
5e849e1 speed up operator test by reducing asnumpy() calls (rahul003)
69608da random data for test_kvstore_local (rahul003)
847a7f2 fix variable confusion error in test (rahul003)
2f8e86e fix randomized data test for local kvstore (rahul003)
69af018 add nrepeat for test_kvstore (rahul003)
32b9e7c merge and fix local kvstore random test (rahul003)
39e2d22 change keys after merge from master introduced same keys (rahul003)
bf3ea61 correct test which fails because grad changes (rahul003)
9c9ae58 change to bit ops (rahul003)
5c42ebb change to bit ops (rahul003)
49e4ee0 use bit array and revert sign changes (rahul003)
44b20e7 merge conflict. remove server changes (rahul003)
f74d317 correct bits setting to 10 as 2 (rahul003)
b67a392 remove switch in dequantize (rahul003)
804f7d1 image classification example changes and remove cpp-api (rahul003)
9629410 merge all quantize, and new type in dist server (rahul003)
0feabd5 fix ndarray dequantize (rahul003)
d3e4df8 debug stuff (rahul003)
d6801dd fix bug (rahul003)
d63e0b4 Merge remote-tracking branch 'origin/gc-quantall' into gc-quantall (rahul003)
e97c477 trying merge dequntize (rahul003)
18df71e Frmework and validation tests for operator validation and performance… (cjolivier01)
9f480ee Remove obsolete file (cjolivier01)
92dd85f Fix compile error for non-CUDA build (cjolivier01)
505d3e7 tweaks in quantize (rahul003)
51d0349 Allow for no backward pass
3e17ec3 Remove unused var
12d4499 merge chris pr (rahul003)
248908c making quantize all compatible as operators (rahul003)
35b42f7 separate mshadow and loop operators (rahul003)
cabb948 working profiler, dequantize mshadow is slow (rahul003)
b8d2b50 fix mshadow dequantize (rahul003)
e09a8fd fix quantize call by kvdist (rahul003)
b2c9f29 making quantize all compatible as operators (rahul003)
6e651ed add profile to measure.py (rahul003)
0c48ebb minor profiler changes (rahul003)
fe66ef9 timing print in cpp operator (rahul003)
f5204ca time quantize (rahul003)
5e473b2 saving data feature added (rahul003)
88cc0fd cleanup test (rahul003)
5c7a1ff small updates (rahul003)
5283035 cleanup (rahul003)
5294d4d minor fix (rahul003)
6bb9933 passing additional environment variables through launch.py (rahul003)
2a7f2f5 update local test (rahul003)
feaae67 update master (rahul003)
080882b Merge branch 'pass-env' of https://github.com/rahul003/mxnet into gc-… (rahul003)
a5abca4 update dmlc with pass-env (rahul003)
7e5301d fix launch pass env issue (rahul003)
594b40c update with pass-env changes (rahul003)
642cfe4 fix operator increment of block, remove unncessary commented code (rahul003)
3c8686a fix operator increment of block, remove unncessary commented code (rahul003)
483d610 fix operator increment of block, remove unncessary commented code (rahul003)
bc245b4 fix operator increment of block, remove unncessary commented code (rahul003)
2f257e5 bring back quantize (rahul003)
106feb8 Merge remote-tracking branch 'origin/gc-quantall' into gc-quantall (rahul003)
46cbf5c fix test (rahul003)
a351723 fix bug with increment of char pointer (rahul003)
c84af06 fix bug with increment of char pointer (rahul003)
d316700 debug module (rahul003)
f044830 Merge branch 'gc-quantall' of https://github.com/rahul003/mxnet into … (rahul003)
e8aa9b5 update test (rahul003)
5f130dd comment all debug statements (rahul003)
c1fbeb7 change init to normal for now (rahul003)
180af91 Merge branch 'gc-quantall' of https://github.com/rahul003/mxnet into … (rahul003)
4e0bded remove debug changes (rahul003)
8a083d2 reorg to create gc class, add delayed start to gc, untested: causing … (rahul003)
c4d9a45 redo header files (rahul003)
3a2060b remove ps (rahul003)
193586e remove unused header (rahul003)
75399ff fix compile issues (rahul003)
ac2886a merge master (rahul003)
e6e41e4 remove multiple delete of gc (rahul003)
a7d6c68 add expected to local kvstore test (rahul003)
970acbb fix operator compile issues (rahul003)
7ec0655 fix operator compile issues (rahul003)
b72df8e fix operator compile and link issues (rahul003)
2913b56 remove gc.cpp (rahul003)
f2e2469 add split function (rahul003)
30eae11 move setting of active gc (rahul003)
f5ddf7f move all to gc.cpp, compile works for cpu (rahul003)
d3b668d WIP gpu compile (rahul003)
f19e7ee compiles and links on both cpu and gpu (rahul003)
42cdbdf move prototypes to header (rahul003)
82f7964 add split function (rahul003)
b96c3c0 undo changes from master (rahul003)
4bb3701 remove cpp perf quantize (rahul003)
5c0114c undo more changes (rahul003)
9557795 Merge branch 'master' of https://github.com/dmlc/mxnet into gc-quantall (rahul003)
3945a8f add inactive function so that multiple kvstore dist inits have no com… (rahul003)
bbdfe1a undo some formatting changes (rahul003)
80957a7 make sharding same when inactive and active (rahul003)
222f33c remove counts and get_active_type (rahul003)
dc3b8e6 remove print (rahul003)
ac55cdc add train caltech (rahul003)
48d54df increase size of mlp (rahul003)
eea86ff update to alexa mlp (rahul003)
b37d36d Merge branch 'gc-quantall' of https://github.com/rahul003/mxnet into … (rahul003)
aa6fb6f pass-env changes (rahul003)
b694f15 add bucketing module compression (rahul003)
e4c46e0 Merge remote-tracking branch 'origin/gc-quantall' into gc-quantall (rahul003)
b84f179 attempts for alexnet training (rahul003)
2578883 prepare for merge (rahul003)
b60b3fb fix lint issues (rahul003)
8328923 fix lint issues (rahul003)
aa242b8 remove caltech (rahul003)
62c5255 address some comments: shared_ptr, documentation, indentaion, new fun… (rahul003)
b8b1d66 move header (rahul003)
b66a3f2 include header corrected (rahul003)
f32b391 include header corrected (rahul003)
0743f60 indents, documentation and test update (rahul003)
6fd68f7 lint (rahul003)
d7aea02 pylint (rahul003)
40f71f8 rename class, fix local kvstore test, remove confusing active method (rahul003)
eabc503 fix importing of compute expected in test_kvstore (rahul003)
806586f fix bug in device kvstore (rahul003)
6070450 remove active comment in pull (rahul003)
2289129 docstring (rahul003)
f41e102 use dmlc params, enums, (rahul003)
5acbc9a doc updates (rahul003)
3c1bacb lint (rahul003)
18d6a90 update from master (rahul003)
dfe7a7d typo (rahul003)
4b6f34a rename field to type (rahul003)
30a197b fix distributed kvstore stopping issue. (rahul003)
3073bf7 Trigger CI (rahul003)
d5e4b2e trigger CI (rahul003)
```diff
@@ -63,6 +63,16 @@ def _ctype_key_value(keys, vals):
         else c_array(ctypes.c_int, [keys] * len(vals))
     return (c_keys, c_array(NDArrayHandle, [value.handle for value in vals]), use_str_keys)
 
+def _ctype_dict(param_dict):
+    """
+    Returns ctype arrays for keys and values(converted to strings) in a dictionary
+    """
+    assert(isinstance(param_dict, dict)), \
+        "unexpected type for param_dict: " + str(type(param_dict))
+    c_keys = c_array(ctypes.c_char_p, [c_str(k) for k in param_dict.keys()])
+    c_vals = c_array(ctypes.c_char_p, [c_str(str(v)) for v in param_dict.values()])
+    return (c_keys, c_vals)
+
 def _updater_wrapper(updater):
     """A wrapper for the user-defined handle."""
     def updater_handle(key, lhs_handle, rhs_handle, _):
```
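For context, the new helper simply turns a Python dict into parallel ctypes string arrays. A hypothetical illustration of its output (`c_array` and `c_str` are the existing helpers in `mxnet.base`; the values shown are assumptions for illustration):

```python
# hypothetical illustration of _ctype_dict's output; not part of the diff
ckeys, cvals = _ctype_dict({'type': '2bit', 'threshold': 0.5})
# ckeys is a ctypes array of c_char_p: (b'type', b'threshold')
# cvals holds the stringified values:  (b'2bit', b'0.5')
# key/value pairing follows the dict's iteration order
```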
```diff
@@ -349,6 +359,58 @@ def row_sparse_pull(self, key, out=None, priority=0, row_ids=None):
         check_call(_LIB.MXKVStorePullRowSparse(
             self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority)))
 
+    def set_gradient_compression(self, compression_params):
+        """ Specifies type of low-bit quantization for gradient compression \
+        and additional arguments depending on the type of compression being used.
+
+        2bit Gradient Compression takes a positive float `threshold`.
+        The technique works by thresholding values such that positive values in the
+        gradient above threshold will be set to threshold. Negative values whose absolute
+        values are higher than threshold, will be set to the negative of threshold.
+        Values whose absolute values are less than threshold will be set to 0.
+        By doing so, each value in the gradient is in one of three states. 2bits are
+        used to represent these states, and every 16 float values in the original
+        gradient can be represented using one float. This compressed representation
+        can reduce communication costs. The difference between these thresholded values and
+        original values is stored at the sender's end as residual and added to the
+        gradient in the next iteration.
+
+        When kvstore is 'local', gradient compression is used to reduce communication
+        between multiple devices (gpus). Gradient is quantized on each GPU which
+        computed the gradients, then sent to the GPU which merges the gradients. This
+        receiving GPU dequantizes the gradients and merges them. Note that this
+        increases memory usage on each GPU because of the residual array stored.
+
+        When kvstore is 'dist', gradient compression is used to reduce communication
+        from worker to sender. Gradient is quantized on each worker which
+        computed the gradients, then sent to the server which dequantizes
+        this data and merges the gradients from each worker. Note that this
+        increases CPU memory usage on each worker because of the residual array stored.
+        Only worker to server communication is compressed in this setting.
+        If each machine has multiple GPUs, currently this GPU to GPU or GPU to CPU communication
+        is not compressed. Server to worker communication (in the case of pull)
+        is also not compressed.
+
+        To use 2bit compression, we need to specify `type` as `2bit`.
+        Only specifying `type` would use default value for the threshold.
+        To completely specify the arguments for 2bit compression, we would need to pass
+        a dictionary which includes `threshold` like:
+        {'type': '2bit', 'threshold': 0.5}
+
+        Parameters
+        ----------
+        compression_params : dict
+            A dictionary specifying the type and parameters for gradient compression.
+            The key `type` in this dictionary is a
+            required string argument and specifies the type of gradient compression.
+            Currently `type` can be only `2bit`
+            Other keys in this dictionary are optional and specific to the type
+            of gradient compression.
+        """
+        ckeys, cvals = _ctype_dict(compression_params)
+        check_call(_LIB.MXKVStoreSetGradientCompression(self.handle,
+                                                        mx_uint(len(compression_params)),
+                                                        ckeys, cvals))
+
     def set_optimizer(self, optimizer):
         """ Registers an optimizer with the kvstore.
```

An inline review comment was attached to the `compression_params : dict` line: "Would this doc render correctly?"
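For reference, here is how the new method is invoked from user code; a minimal sketch based on the docstring above, assuming a kvstore has been created:

```python
import mxnet as mx

kv = mx.kv.create('dist_sync')  # gradient compression also applies to a 'local' kvstore
# `type` is required; omitting `threshold` falls back to its default value
kv.set_gradient_compression({'type': '2bit', 'threshold': 0.5})
```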
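The thresholding-plus-residual scheme described in the docstring can be made concrete with a small NumPy sketch. This is illustrative only, not the MXNet kernel: the real implementation additionally packs the three states into 2 bits each (16 values per 32-bit float), and the function name here is hypothetical.

```python
import numpy as np

def two_bit_quantize_reference(grad, residual, threshold=0.5):
    """Reference semantics only: map each value to {-threshold, 0, +threshold}."""
    grad_r = grad + residual                  # fold in the residual kept from last iteration
    out = np.zeros_like(grad_r)
    out[grad_r >= threshold] = threshold      # large positive values saturate to +threshold
    out[grad_r <= -threshold] = -threshold    # large negative values saturate to -threshold
    residual[:] = grad_r - out                # store the quantization error for next time
    return out

grad = np.array([0.7, -1.2, 0.1, 0.4])
residual = np.zeros_like(grad)
print(two_bit_quantize_reference(grad, residual))  # [ 0.5 -0.5  0.   0. ]
print(residual)                                    # approx. [ 0.2 -0.7  0.1  0.4]
```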
Review comment:
No support for un-setting gradient compression? What happens if a user tries to unset it?
Reply:
If the user calls `kvstore.set_gradient_compression({'type': 'none'})` after setting it to `2bit`, it throws an error because `none` can't be a type. If the user sets `2bit` again with a different threshold, the new threshold will be used from then on, but there might be a transition period in which gradients quantized with the old threshold are dequantized with the new threshold, because of the delay in synchronization.
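A short sketch of the behavior described in this exchange (hedged: the exact exception type is not stated in the thread):

```python
kv.set_gradient_compression({'type': '2bit', 'threshold': 0.5})  # compression enabled
kv.set_gradient_compression({'type': 'none'})  # raises an error: 'none' is not a valid type
kv.set_gradient_compression({'type': '2bit', 'threshold': 1.0})  # new threshold used from now on
```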