src/operator/quantization/quantized_fully_connected.cu

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file quantized_fully_connected.cu
 * \brief
 * \author Ziheng Jiang, Jun Wu
 */
#include "./quantization_utils.h"
#include "../mxnet_op.h"
#include "../nn/fully_connected-inl.h"

namespace mxnet {
namespace op {

#if CUDA_VERSION >= 8000
// value + bias_value * (range1 / limit_range1) * (limit_range2 / range2)
struct QuantizedBiasAddKernel {
  MSHADOW_XINLINE static void Map(int i,
                                  size_t k,
                                  int32_t* out,
                                  const int8_t* bias,
                                  const float* min_out,
                                  const float* max_out,
                                  const float* min_bias,
                                  const float* max_bias) {
    typedef int32_t T1;
    typedef int8_t T2;
    using mshadow::red::limits::MaxValue;
    using mshadow::red::limits::MinValue;
    float float_for_one_out_quant =
        MaxAbs(*min_out, *max_out) / static_cast<double>(MaxValue<T1>());
    float float_for_one_bias_quant =
        MaxAbs(*min_bias, *max_bias) / static_cast<double>(MaxValue<T2>());
    out[i] = (out[i] * float_for_one_out_quant + bias[i % k] * float_for_one_bias_quant) /
             float_for_one_out_quant;
  }
};
#endif  // CUDA_VERSION >= 8000

template <typename SrcType, typename DstType, typename CmpType>
void QuantizedFullyConnectedForwardGPU(const nnvm::NodeAttrs& attrs,
                                       const OpContext& ctx,
                                       const std::vector<TBlob>& inputs,
                                       const std::vector<OpReqType>& req,
                                       const std::vector<TBlob>& outputs) {
#if CUDA_VERSION >= 8000
  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
  using namespace mshadow;
  using namespace mxnet_op;
  size_t num_inputs = param.no_bias ? 2 : 3;
  CHECK_EQ(inputs.size(), num_inputs * 3);
  CHECK_EQ(outputs.size(), 3U);
  Stream<gpu>* s = ctx.get_stream<gpu>();
  CHECK_EQ(s->blas_handle_ownership_, Stream<gpu>::OwnHandle);
  const TBlob& data    = inputs[0];
  const TBlob& weight  = inputs[1];
  const TBlob& out     = outputs[0];
  mxnet::TShape dshape = data.shape_;
  mxnet::TShape wshape = weight.shape_;
  mxnet::TShape oshape = out.shape_;
  // (m, n) * (k, n).T = (m, k)
  // A * B.T = C
  if (dshape.ndim() != 2) {
    CHECK(param.flatten) << "Currently, QuantizedFullyConnected Op only supports flatten=true "
                         << "when ishape.ndim()!=2 for GPU.";
  }

  // row_C = col_C(T) = cublas(col_B * col_A(T)) = cublas(row_B(T), row_A)
  // row_C = col_C(T) = cublas(col_B(T) * col_A(T)) = cublas(row_B, row_A)
  const int m = dshape[0], n = dshape.ProdShape(1, dshape.ndim()), k = wshape[0];
  CmpType alpha               = 1.0f;
  CmpType beta                = 0.0f;
  const cudaDataType src_type = mshadow::DataType<SrcType>::kCudaFlag;
  const cudaDataType dst_type = mshadow::DataType<DstType>::kCudaFlag;
  const cudaDataType cmp_type = mshadow::DataType<CmpType>::kCudaFlag;
  CUBLAS_CALL(cublasGemmEx(s->blas_handle_,
                           CUBLAS_OP_T,
                           CUBLAS_OP_N,
                           k,
                           m,
                           n,
                           &alpha,
                           weight.dptr_,
                           src_type,
                           n,
                           data.dptr_,
                           src_type,
                           n,
                           &beta,
                           out.dptr_,
                           dst_type,
                           k,
                           cmp_type,
                           CUBLAS_GEMM_DFALT));

  Kernel<QuantizationRangeForS8S8MultiplicationStruct, gpu>::Launch(
      s,
      1,
      outputs[1].dptr<float>(),
      outputs[2].dptr<float>(),
      inputs[num_inputs].dptr<float>(),
      inputs[num_inputs + 1].dptr<float>(),
      inputs[num_inputs + 2].dptr<float>(),
      inputs[num_inputs + 3].dptr<float>());

  if (!param.no_bias) {
    const TBlob& bias = inputs[2];
    Kernel<QuantizedBiasAddKernel, gpu>::Launch(s,
                                                out.Size(),
                                                k,
                                                out.dptr<int32_t>(),
                                                bias.dptr<int8_t>(),
                                                outputs[1].dptr<float>(),
                                                outputs[2].dptr<float>(),
                                                inputs[7].dptr<float>(),
                                                inputs[8].dptr<float>());
  }
#else
  LOG(FATAL) << "QuantizedFullyConnectedForwardGPU only supports CUDA >= 8.0";
#endif  // CUDA_VERSION >= 8000
}

NNVM_REGISTER_OP(_contrib_quantized_fully_connected)
    .set_attr<FCompute>("FCompute<gpu>",
                        QuantizedFullyConnectedForwardGPU<int8_t, int32_t, int32_t>);

}  // namespace op
}  // namespace mxnet