From 5030a65fb371f7adfd89d8fdd16f1201042545cd Mon Sep 17 00:00:00 2001
From: Sam Skalicky
Date: Thu, 15 Aug 2019 21:15:43 +0000
Subject: [PATCH 001/111] fixed example to use absolute path

---
 example/lib_api/test.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/example/lib_api/test.py b/example/lib_api/test.py
index d73d85c02ced..840924c1317c 100644
--- a/example/lib_api/test.py
+++ b/example/lib_api/test.py
@@ -26,6 +26,8 @@ import os
 if (os.name=='posix'):
-    mx.library.load('mylib.so')
+    path = os.path.abspath('mylib.so')
+    mx.library.load(path)
 elif (os.name=='nt'):
-    mx.library.load('mylib.dll')
+    path = os.path.abspath('mylib.dll')
+    mx.library.load(path)

From 23a226a36a6cdf69471bce8a9b318dff5e50f777 Mon Sep 17 00:00:00 2001
From: Sam Skalicky
Date: Fri, 16 Aug 2019 07:17:03 +0000
Subject: [PATCH 002/111] added example for custom ops, added support for
 custom op registration

---
 Makefile                   |   2 +-
 example/lib_ops/Makefile   |  31 ++++++
 example/lib_ops/libtest.cc |  78 +++++++++++++++
 example/lib_ops/mylib.cc   | 107 ++++++++++++++++++++
 example/lib_ops/test.py    |  33 ++++++
 include/mxnet/lib_api.h    | 199 +++++++++++++++++++++++++++++++++++--
 src/c_api/c_api.cc         |  24 +++++
 7 files changed, 464 insertions(+), 10 deletions(-)
 create mode 100644 example/lib_ops/Makefile
 create mode 100644 example/lib_ops/libtest.cc
 create mode 100644 example/lib_ops/mylib.cc
 create mode 100644 example/lib_ops/test.py

diff --git a/Makefile b/Makefile
index a14c29452f6e..e3ccdac27943 100644
--- a/Makefile
+++ b/Makefile
@@ -660,7 +660,7 @@ pylint:
 	python3 -m pylint --rcfile=$(ROOTDIR)/ci/other/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" python/mxnet tools/caffe_converter/*.py

 sample_lib:
-	$(CXX) -shared -fPIC example/lib_api/mylib.cc -o libsample_lib.so -I include/mxnet
+	$(CXX) -shared -fPIC -std=c++11 example/lib_api/mylib.cc -o libsample_lib.so -I include/mxnet

 doc: docs

diff --git a/example/lib_ops/Makefile b/example/lib_ops/Makefile
new file mode 100644
index 000000000000..f649a68eee9a
--- /dev/null
+++ b/example/lib_ops/Makefile
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+all:
+	g++ -shared -fPIC -std=gnu++0x mylib.cc -o mylib.so -I ../../include/mxnet
+
+test:
+	g++ -std=c++11 -O3 -o libtest libtest.cc -ldl -I ../../include/mxnet
+
+windows:
+	cl /LD mylib.cc
+
+win_test:
+	cl libtest.cc
+
+clean:
+	rm -rf mylib.so libtest
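
Note on the Makefile above: the "all" target builds the operator library with -std=gnu++0x while the "test" target uses -std=c++11. Both select C++11 (gnu++0x additionally enables GNU extensions), so the mismatch is cosmetic. Assuming a Linux host with example/lib_ops as the working directory, the intended flow is: run "make" to produce mylib.so, then "make test" followed by "./libtest" to smoke-test symbol loading outside of MXNet, and finally "python3 test.py" against an installed MXNet.
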
diff --git a/example/lib_ops/libtest.cc b/example/lib_ops/libtest.cc
new file mode 100644
index 000000000000..8bdf36c05d37
--- /dev/null
+++ b/example/lib_ops/libtest.cc
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file libtest.cc
+ * \brief This test checks if the library is implemented correctly
+ * and does not involve dynamic loading of library into MXNet
+ * This test is supposed to be run before test.py
+ */
+
+#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+#include <iostream>
+#include "lib_api.h"
+
+#define MXNET_VERSION 10500
+
+int main(void) {
+  // Get a handle to the library.
+#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
+  HINSTANCE handle;
+  handle = LoadLibrary(TEXT("mylib.dll"));
+#else
+  void *handle;
+  handle = dlopen("mylib.so", RTLD_LAZY);
+#endif
+
+  if (!handle) {
+    std::cerr << "Unable to load library" << std::endl;
+    return 1;
+  }
+
+  // get initialize function address from the library
+  initialize_t init_lib;
+#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
+  init_lib = (initialize_t) GetProcAddress(handle, MXLIB_INITIALIZE_STR);
+#else
+  init_lib = (initialize_t) dlsym(handle, MXLIB_INITIALIZE_STR);
+#endif
+
+  if (!init_lib) {
+    std::cerr << "Unable to get function 'initialize' from library" << std::endl;
+    return 1;
+  }
+
+  // Call the function.
+  (init_lib)(MXNET_VERSION);
+
+  // Deallocate memory.
+#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
+  FreeLibrary(handle);
+#else
+  dlclose(handle);
+#endif
+
+  return 0;
+}
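
Note: later patches in this series resolve these exported symbols inside MXNet through a get_func<T> helper rather than calling dlsym directly; that helper is defined elsewhere in the MXNet tree. A rough, POSIX-only sketch of what such a helper has to do (illustrative names and error handling, not the actual implementation):

    #include <dlfcn.h>
    #include <stdexcept>
    #include <string>

    // resolve a symbol from an already-dlopen'ed library handle and
    // cast it to the expected function-pointer type
    template <typename T>
    T get_func(void* lib, const char* name) {
      T func = reinterpret_cast<T>(dlsym(lib, name));
      if (func == nullptr)
        throw std::runtime_error(std::string("couldn't find symbol: ") + name);
      return func;
    }
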
diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc
new file mode 100644
index 000000000000..89a012d310d5
--- /dev/null
+++ b/example/lib_ops/mylib.cc
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file mylib.cc
+ * \brief Sample library file
+ */
+
+#include <iostream>
+#include "lib_api.h"
+
+void gemm(double* A, double* B, double* C, unsigned n, unsigned k, unsigned m) {
+  unsigned i,j,kk;
+  for (i=0;i<n;i++) {
+    for (j=0;j<m;j++) {
+      C[i*m+j] = 0;
+      for (kk=0;kk<k;kk++) {
+        C[i*m+j] += A[i*k+kk] * B[kk*m+j];
+      }
+    }
+  }
+}
+
+int myFCompute(std::map<std::string, std::string> attrs,
+               std::vector<MXTensor> inputs, std::vector<MXTensor> outputs) {
+
+  double* input1 = inputs[0].getData<double>();
+  double* input2 = inputs[1].getData<double>();
+  double* output = outputs[0].getData<double>();
+  unsigned n = inputs[0].shape[0];
+  unsigned k = inputs[0].shape[1];
+  unsigned m = inputs[1].shape[1];
+
+  gemm(input1, input2, output, n, k, m);
+
+  return 1;
+}
+
+int parseAttrs(std::map<std::string, std::string> attrs,
+               int* num_in, int* num_out) {
+
+  if(attrs.find("myParam") == attrs.end()) {
+    std::cout << "Missing param 'myParam'" << std::endl;
+    return 0;
+  }
+
+  *num_in = 2;
+  *num_out = 1;
+
+  return 1; //no error
+}
+
+int inferType(std::map<std::string, std::string> attrs, std::vector<int> &intypes,
+              std::vector<int> &outtypes) {
+  outtypes[0] = intypes[0];
+
+  return 1; //no error
+}
+
+int inferShape(std::map<std::string, std::string> attrs, std::vector<std::vector<unsigned int>> &inshapes,
+               std::vector<std::vector<unsigned int>> &outshapes) {
+  unsigned n = inshapes[0][0];
+  unsigned k = inshapes[0][1];
+  unsigned kk = inshapes[1][0];
+  unsigned m = inshapes[1][1];
+
+  if(k != kk) return 0;
+
+  outshapes[0].push_back(n);
+  outshapes[0].push_back(m);
+
+  return 1; //no error
+}
+
+REGISTER_OP(sam)
+.setFCompute(myFCompute)
+.setParseAttrs(parseAttrs)
+.setInferType(inferType)
+.setInferShape(inferShape);
+
+int initialize(int version) {
+  if (version >= 10400) {
+    std::cout << "MXNet version " << version << " supported" << std::endl;
+    return 1;
+  } else {
+    std::cout << "MXNet version " << version << " not supported" << std::endl;
+    return 0;
+  }
+}

diff --git a/example/lib_ops/test.py b/example/lib_ops/test.py
new file mode 100644
index 000000000000..840924c1317c
--- /dev/null
+++ b/example/lib_ops/test.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=arguments-differ
+
+# This test checks if dynamic loading of library into MXNet is successful
+
+import mxnet as mx
+import os
+
+if (os.name=='posix'):
+    path = os.path.abspath('mylib.so')
+    mx.library.load(path)
+elif (os.name=='nt'):
+    path = os.path.abspath('mylib.so')
+    mx.library.load(path)
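
Note on the registration macro defined in the lib_api.h hunk below: expanded by hand (with __COUNTER__ shown as 0), the REGISTER_OP(sam) statement in mylib.cc amounts to roughly

    // approximate expansion; the __COUNTER__ suffix only serves to make the
    // global's name unique within the translation unit
    CustomOp MXNet_CustomOp_0 = Registry<CustomOp>::get()->add("sam")
        .setFCompute(myFCompute)
        .setParseAttrs(parseAttrs)
        .setInferType(inferType)
        .setInferShape(inferShape);

The chained setters run on the entry held by the Registry<CustomOp> singleton before the copy into the global, so loading the library only has to walk that registry to discover every registered operator.
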
diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index ca3b2952eafa..8668ca483326 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -25,26 +25,207 @@
 #ifndef MXNET_LIB_API_H_
 #define MXNET_LIB_API_H_

+#include <map>
+#include <string>
+#include <vector>
+
+/*!
+ * \brief External Tensor data types
+ */
+enum MXDType {
+  kFloat32 = 0,
+  kFloat64 = 1,
+  kFloat16 = 2,
+  kUint8 = 3,
+  kInt32 = 4,
+  kInt8 = 5,
+  kInt64 = 6,
+};
+
+/*!
+ * \brief External Tensor data structure
+ */
+struct MXTensor {
+  MXTensor() { data = nullptr; }
+  MXTensor(void *data, const std::vector<int64_t> &shape, MXDType dtype)
+    : data{data}, shape{shape}, dtype{dtype} {}
+
+  /*!
+   * \brief helper function to cast data pointer
+   */
+  template<typename data_type>
+  data_type* getData() {
+    return (data_type*)data;
+  }
+
+  void *data;  // not owned
+  std::vector<int64_t> shape;
+  MXDType dtype;
+};
+
+/*!
+ * Custom Operator function templates
+ */
+typedef int (*fcomp_t)(std::map<std::string, std::string>,
+                       std::vector<MXTensor>, std::vector<MXTensor>);
+typedef int (*parseAttrs_t)(std::map<std::string, std::string>,
+                            int*, int*);
+typedef int (*inferType_t)(std::map<std::string, std::string>,
+                           std::vector<int>&, std::vector<int>&);
+typedef int (*inferShape_t)(std::map<std::string, std::string>,
+                            std::vector<std::vector<unsigned int>>&,
+                            std::vector<std::vector<unsigned int>>&);
+
+/*!
+ * \brief Class to hold custom operator registration
+ */
+class CustomOp {
+ public:
+  CustomOp(const char* op_name) : name(op_name), fcompute(nullptr),
+    parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr) {}
+  ~CustomOp() {}
+  CustomOp& setFCompute(fcomp_t fcomp) {
+    fcompute = fcomp;
+    return *this;
+  }
+  CustomOp& setParseAttrs(parseAttrs_t func) {
+    parse_attrs = func;
+    return *this;
+  }
+  CustomOp& setInferType(inferType_t func) {
+    infer_type = func;
+    return *this;
+  }
+  CustomOp& setInferShape(inferShape_t func) {
+    infer_shape = func;
+    return *this;
+  }
+  /*! \brief operator name */
+  const char* name;
+  /*! \brief operator functions */
+  fcomp_t fcompute;
+  parseAttrs_t parse_attrs;
+  inferType_t infer_type;
+  inferShape_t infer_shape;
+};
+
+/*!
+ * \brief Registry class to register things (ops, properties)
+ * Singleton class
+ */
+template <class T>
+class Registry {
+ public:
+  /*!
+   * \brief get singleton pointer to class
+   * \returns pointer to class
+   */
+  static Registry* get() {
+    static Registry inst;
+    return &inst;
+  }
+  /*!
+   * \brief add a new entry
+   * \returns new object associated with registered name
+   */
+  T& add(const char* name) {
+    T *entry = new T(name);
+    entries.push_back(entry);
+    return *entry;
+  }
+  int size() {
+    return entries.size();
+  }
+  T& get(int idx) {
+    return *(entries[idx]);
+  }
+ private:
+  /*! \brief constructor */
+  Registry() {}
+  /*! \brief destructor */
+  ~Registry() {}
+  /*! \brief map of entries in registry */
+  std::vector<T*> entries;
+};
+
+
+/*
+ * Macros to help with string concat
+ * Annoyingly, the concat_ and concat macros are necessary to
+ * be able to use __COUNTER__ in an identifier name
+ */
+#define _STR_CONCAT_(__a, __b) __a ## __b
+#define _STR_CONCAT(__a, __b) _STR_CONCAT_(__a, __b)
+
+/*!
+ * \brief convert a token to a string
+ */
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+
+/*!
+ * \brief declare a variable with custom name
+ */
+#define _REGISTER_NAME_(Name) MXNet ## _CustomOp ## _
+#define _REGISTER_DEF_(Name) CustomOp _REGISTER_NAME_(Name)
+
+/*!
+ * \brief assign a var to a value
+ */
+#define REGISTER_OP(Name) _STR_CONCAT(_REGISTER_DEF_(Name), __COUNTER__) = Registry<CustomOp>::get()->add(TOSTRING(Name))
+
+
 /*!
  * \brief Following are the APIs implemented in the external library
  * Each API has a #define string that is used to lookup the function in the library
  * Followed by the function declaration
  */
+
+
+#define MXLIB_OPREGSIZE_STR "_opRegSize"
+typedef int (*opRegSize_t)(void);
+
+#define MXLIB_OPREGGET_STR "_opRegGet"
+typedef int (*opRegGet_t)(int, const char**, fcomp_t*,
+                          parseAttrs_t*, inferType_t*,
+                          inferShape_t*);
+
 #define MXLIB_INITIALIZE_STR "initialize"
 typedef int (*initialize_t)(int);

 extern "C" {
-  /*!
-   * \brief Checks if the MXNet version is supported by the library.
- * If supported, initializes the library. - * \param version MXNet version number passed to library and defined as: - * MXNET_VERSION = (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH) - * \return Non-zero value on error i.e. library incompatible with passed MXNet version - */ + /*! + * \brief returns number of ops registered in this library + */ + int _opRegSize() { + return Registry::get()->size(); + } + + /*! + * \brief returns operator registration at specified index + */ + void _opRegGet(int idx, const char** name, fcomp_t* fcomp, + parseAttrs_t* parse, inferType_t* type, + inferShape_t* shape) { + CustomOp op = Registry::get()->get(idx); + *name = op.name; + *fcomp = op.fcompute; + *parse = op.parse_attrs; + *type = op.infer_type; + *shape = op.infer_shape; + } + + /*! + * \brief Checks if the MXNet version is supported by the library. + * If supported, initializes the library. + * \param version MXNet version number passed to library and defined as: + * MXNET_VERSION = (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH) + * \return Non-zero value on error i.e. library incompatible with passed MXNet version + */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) int __cdecl initialize(int); + __declspec(dllexport) int __cdecl initialize(int); #else - int initialize(int); + int initialize(int); #endif } #endif // MXNET_LIB_API_H_ diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index c2b80b3f601c..25c6d80717c5 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -102,6 +102,30 @@ int MXLoadLib(const char *path) { initialize_t initialize = get_func(lib, const_cast(MXLIB_INITIALIZE_STR)); if (!initialize(static_cast(MXNET_VERSION))) LOG(FATAL) << "Library failed to initialize"; + + opRegSize_t opRegSize = get_func(lib, const_cast(MXLIB_OPREGSIZE_STR)); + int numOps = opRegSize(); + LOG(INFO) << "Found " << numOps << " operators in library"; + + opRegGet_t opRegGet = get_func(lib, const_cast(MXLIB_OPREGGET_STR)); + for(int i=0; i Date: Sat, 17 Aug 2019 02:39:05 +0000 Subject: [PATCH 003/111] added fcompute registration for loaded operators moved library import order to after ndarray/symbol --- example/lib_ops/test.py | 2 ++ include/mxnet/lib_api.h | 37 +++++++++++++++++++++++++++ python/mxnet/__init__.py | 3 ++- python/mxnet/library.py | 8 +++++- src/c_api/c_api.cc | 55 +++++++++++++++++++++++++++++++++++++++- 5 files changed, 102 insertions(+), 3 deletions(-) diff --git a/example/lib_ops/test.py b/example/lib_ops/test.py index 840924c1317c..8c7ccf02c886 100644 --- a/example/lib_ops/test.py +++ b/example/lib_ops/test.py @@ -31,3 +31,5 @@ elif (os.name=='nt'): path = os.path.abspath('mylib.so') mx.library.load(path) + +print(mx.nd.sam) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 8668ca483326..4ea284e39754 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -190,6 +190,11 @@ typedef int (*opRegGet_t)(int, const char**, fcomp_t*, parseAttrs_t*, inferType_t*, inferShape_t*); +#define MXLIB_OPCALLFCOMP_STR "_opCallFCompute" +typedef int (*opCallFComp_t)(fcomp_t, const char* const*, const char* const*, int, + const int64_t**, int*, void**, int*, int, + const int64_t**, int*, void**, int*, int); + #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int); @@ -215,6 +220,38 @@ extern "C" { *shape = op.infer_shape; } + int _opCallFCompute(fcomp_t fcomp, const char* const* keys, const char* const* vals, int num, + const int64_t** inshapes, int* indims, void** indata, int* intypes, int 
num_in, + const int64_t** outshapes, int* outdims, void** outdata, int* outtypes, int num_out) { + //create map of attributes from list + std::map attrs; + for(int i=0; i inputs(num_in); + for(int i=0; i outputs(num_out); + for(int i=0; i(lib, const_cast(MXLIB_INITIALIZE_STR)); if (!initialize(static_cast(MXNET_VERSION))) LOG(FATAL) << "Library failed to initialize"; + //get function to call fcompute + opCallFComp_t callFComp = get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); + + //get number of operators registered in the library opRegSize_t opRegSize = get_func(lib, const_cast(MXLIB_OPREGSIZE_STR)); int numOps = opRegSize(); LOG(INFO) << "Found " << numOps << " operators in library"; + //loop and register each operator in the library opRegGet_t opRegGet = get_func(lib, const_cast(MXLIB_OPREGGET_STR)); for(int i=0; i& inputs, + const std::vector& req, + const std::vector& outputs) { + //convert attributes to vector of char* + std::vector attr_keys,attr_vals; + for(auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + + std::vector in_data, out_data; + std::vector in_shapes, out_shapes; + std::vector in_dims, out_dims; + std::vector in_types, out_types; + + //convert input tensors to constituant parts + for(size_t i=0; i::Get()->__REGISTER_OR_GET__(name); + regOp.set_attr("FCompute",fcomp_conv); } API_END(); From 915c1d5707c5af7509fce85ad9a59fe8f009028d Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Sat, 17 Aug 2019 05:58:56 +0000 Subject: [PATCH 004/111] changed dynamic ops to be contrib --- example/lib_ops/test.py | 2 +- src/c_api/c_api.cc | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/example/lib_ops/test.py b/example/lib_ops/test.py index 8c7ccf02c886..bb2db9ab4be6 100644 --- a/example/lib_ops/test.py +++ b/example/lib_ops/test.py @@ -32,4 +32,4 @@ path = os.path.abspath('mylib.so') mx.library.load(path) -print(mx.nd.sam) +print(mx.nd.contrib.sam) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 07c7cef2cdc9..ef422fd57fd6 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1,3 +1,4 @@ + /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file @@ -175,7 +176,9 @@ int MXLoadLib(const char *path) { }; //re-register op in MXNet using lambda converter functions - nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); + std::string contrib_name("_contrib_"); + contrib_name += name; + nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(contrib_name.c_str()); regOp.set_attr("FCompute",fcomp_conv); } From f568e3d3e53702be44fe4c8564be9acb6f2817af Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Sun, 18 Aug 2019 07:41:27 +0000 Subject: [PATCH 005/111] added num in/out --- include/mxnet/lib_api.h | 16 +++++++++++++ src/c_api/c_api.cc | 53 ++++++++++++++++++++++++++++++++++------- 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 4ea284e39754..47ed086333fd 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -190,6 +190,10 @@ typedef int (*opRegGet_t)(int, const char**, fcomp_t*, parseAttrs_t*, inferType_t*, inferShape_t*); +#define MXLIB_OPCALLPARSEATTRS_STR "_opCallParseAttrs" +typedef int (*opCallParseAttrs_t)(parseAttrs_t, const char* const*, const char* const*, int, + int*, int*); + #define MXLIB_OPCALLFCOMP_STR "_opCallFCompute" typedef int (*opCallFComp_t)(fcomp_t, const char* const*, const char* const*, int, const int64_t**, int*, void**, int*, int, @@ -220,6 +224,18 @@ extern "C" { *shape = op.infer_shape; } + int _opCallParseAttrs(parseAttrs_t parseAttrs, const char* const* keys, const char* const* vals, int num, + int* num_in, int* num_out) { + //create map of attributes from list + std::map attrs; + for(int i=0; i(MXNET_VERSION))) LOG(FATAL) << "Library failed to initialize"; - //get function to call fcompute + //get call functions + opCallParseAttrs_t callParseAttrs = get_func(lib, const_cast(MXLIB_OPCALLPARSEATTRS_STR)); opCallFComp_t callFComp = get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); //get number of operators registered in the library @@ -132,14 +133,26 @@ int MXLoadLib(const char *path) { CHECK(shape != nullptr) << "Error loading '" << name << "' custom op, InferShape function was not set."; LOG(INFO) << "\tOp[" << i << "] " << name; - std::string name_str(name); - //generate lambda functions to convert from MXNet types to external types - auto fcomp_conv = [=](const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + + auto num_inputs = [=](const NodeAttrs& attrs) { + //convert attributes to vector of char + std::vector attr_keys, attr_vals; + for(auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + + int num_in=-1; + int num_out=-1; + CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), + &num_in, &num_out)) + << "Error calling ParseAttrs for custom operator '" << name_str << "'"; + + return num_in; + }; + + auto num_outputs = [=](const NodeAttrs& attrs) { //convert attributes to vector of char* std::vector attr_keys,attr_vals; for(auto kv : attrs.dict) { @@ -147,6 +160,28 @@ int MXLoadLib(const char *path) { attr_vals.push_back(kv.second.c_str()); } + int num_in=-1; + int num_out=-1; + CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), + &num_in, &num_out)) + << "Error calling ParseAttrs for custom operator '" << name_str << "'"; + + return num_out; + }; + + // lambda function to convert from external fcompute to internal MXNet types + auto fcomp_conv = [=](const nnvm::NodeAttrs& attrs, + 
const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + //convert attributes to vector of char* + std::vector attr_keys, attr_vals; + for(auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + std::vector in_data, out_data; std::vector in_shapes, out_shapes; std::vector in_dims, out_dims; @@ -180,6 +215,8 @@ int MXLoadLib(const char *path) { contrib_name += name; nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(contrib_name.c_str()); regOp.set_attr("FCompute",fcomp_conv); + regOp.set_num_inputs(num_inputs); + regOp.set_num_outputs(num_outputs); } API_END(); From 8e12588f4d178a579f38c0e60420c0eb673f7b84 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 20 Aug 2019 05:26:15 +0000 Subject: [PATCH 006/111] removed contrib op registration re-registered ops from mx.nd.op to mx.nd --- python/mxnet/library.py | 15 +++++++++++++++ src/c_api/c_api.cc | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/python/mxnet/library.py b/python/mxnet/library.py index ec6e47d93f12..fcf39815d8e9 100644 --- a/python/mxnet/library.py +++ b/python/mxnet/library.py @@ -19,6 +19,7 @@ """Library management API of mxnet.""" from __future__ import absolute_import import ctypes +import sys import os from .base import _LIB, check_call, MXNetError, _init_op_module from .ndarray.register import _make_ndarray_function @@ -53,3 +54,17 @@ def load(path): #regenerate operators _init_op_module('mxnet', 'ndarray', _make_ndarray_function) _init_op_module('mxnet', 'symbol', _make_symbol_function) + + #re-register mx.nd.op into mx.nd + mx_nd = sys.modules["mxnet.ndarray"] + mx_nd_op = sys.modules["mxnet.ndarray.op"] + for op in dir(mx_nd_op): + func = getattr(mx_nd_op,op) + setattr(mx_nd,op,func) + + #re-register mx.sym.op into mx.sym + mx_sym = sys.modules["mxnet.symbol"] + mx_sym_op = sys.modules["mxnet.symbol.op"] + for op in dir(mx_sym_op): + func = getattr(mx_sym_op,op) + setattr(mx_sym,op,func) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index e2aec8009cec..28c3de5b3234 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -211,7 +211,8 @@ int MXLoadLib(const char *path) { }; //re-register op in MXNet using lambda converter functions - std::string contrib_name("_contrib_"); + //std::string contrib_name("_contrib_"); + std::string contrib_name(""); contrib_name += name; nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(contrib_name.c_str()); regOp.set_attr("FCompute",fcomp_conv); From 1e27a47857dfdc3bb7830582ab07ddc777d6aac5 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 20 Aug 2019 06:59:57 +0000 Subject: [PATCH 007/111] added support for infer shape, updated example to call operator --- example/lib_ops/mylib.cc | 46 ++++++++++++--- example/lib_ops/test.py | 12 +++- include/mxnet/lib_api.h | 66 ++++++++++++++++++++- src/c_api/c_api.cc | 123 +++++++++++++++++++++++++++++++++++---- 4 files changed, 226 insertions(+), 21 deletions(-) diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index 89a012d310d5..5916c9d46683 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -26,7 +26,10 @@ #include #include "lib_api.h" -void gemm(double* A, double* B, double* C, unsigned n, unsigned k, unsigned m) { +/* + * main matrix multiplication routine + */ +void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { unsigned i,j,kk; for (i=0;i attrs, std::vector inputs, std::vector outputs) { + //validate inputs + for(int 
i=0; i(); - double* input2 = inputs[1].getData(); - double* output = outputs[0].getData(); + //extract data pointers from tensors + float* input1 = inputs[0].getData(); + float* input2 = inputs[1].getData(); + float* output = outputs[0].getData(); + //set tensor shapes unsigned n = inputs[0].shape[0]; unsigned k = inputs[0].shape[1]; unsigned m = inputs[1].shape[1]; gemm(input1, input2, output, n, k, m); - return 1; + return 1; //no error } int parseAttrs(std::map attrs, int* num_in, int* num_out) { - + /* if(attrs.find("myParam") == attrs.end()) { std::cout << "Missing param 'myParam'" << std::endl; return 0; } - + */ *num_in = 2; *num_out = 1; @@ -76,11 +89,30 @@ int inferType(std::map attrs, std::vector &intypes int inferShape(std::map attrs, std::vector> &inshapes, std::vector> &outshapes) { + //validate inputs + if(inshapes.size() != 2) { + std::cout << "Expected 2 inputs to inferShape" << std::endl; + return 0; + } + + if(inshapes[0].size() != 2) { + std::cout << "Expected 2D for first input to inferShape" << std::endl; + return 0; + } + + if(inshapes[1].size() != 2) { + std::cout << "Expected 2D for second input to inferShape" << std::endl; + return 0; + } + unsigned n = inshapes[0][0]; unsigned k = inshapes[0][1]; unsigned kk = inshapes[1][0]; unsigned m = inshapes[1][1]; + std::cout << "inshapes[0][0]=" << n << " inshapes[0][1]=" << k << std::endl; + std::cout << "inshapes[1][0]=" << kk << " inshapes[1][1]=" << m << std::endl; + if(k != kk) return 0; outshapes[0].push_back(n); diff --git a/example/lib_ops/test.py b/example/lib_ops/test.py index bb2db9ab4be6..cdf78bc5c9c3 100644 --- a/example/lib_ops/test.py +++ b/example/lib_ops/test.py @@ -25,6 +25,7 @@ import mxnet as mx import os +#load library if (os.name=='posix'): path = os.path.abspath('mylib.so') mx.library.load(path) @@ -32,4 +33,13 @@ path = os.path.abspath('mylib.so') mx.library.load(path) -print(mx.nd.contrib.sam) +#setup inputs to call test operator +a = mx.nd.array([[1,2],[3,4]]) +b = mx.nd.array([[5,6],[7,8]]) + +#print inputs +print(a) +print(b) + +#compute and print output +print(mx.nd.sam(a,b)) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 47ed086333fd..e19d67cccee4 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -190,10 +190,18 @@ typedef int (*opRegGet_t)(int, const char**, fcomp_t*, parseAttrs_t*, inferType_t*, inferShape_t*); +#define MXLIB_OPCALLFREE_STR "_opCallFree" +typedef int (*opCallFree_t)(void*); + #define MXLIB_OPCALLPARSEATTRS_STR "_opCallParseAttrs" typedef int (*opCallParseAttrs_t)(parseAttrs_t, const char* const*, const char* const*, int, int*, int*); +#define MXLIB_OPCALLINFERSHAPE_STR "_opCallInferShape" +typedef int (*opCallInferShape_t)(inferShape_t, const char* const*, const char* const*, int, + unsigned int**, int*, int, + unsigned int***, int**, int); + #define MXLIB_OPCALLFCOMP_STR "_opCallFCompute" typedef int (*opCallFComp_t)(fcomp_t, const char* const*, const char* const*, int, const int64_t**, int*, void**, int*, int, @@ -224,6 +232,16 @@ extern "C" { *shape = op.infer_shape; } + /*! + * \brief calls free from the external library for library allocated arrays + */ + void _opCallFree(void* ptr) { + free(ptr); + } + + /*! 
+ * \brief returns status of calling parse attributes function for operator from library + */ int _opCallParseAttrs(parseAttrs_t parseAttrs, const char* const* keys, const char* const* vals, int num, int* num_in, int* num_out) { //create map of attributes from list @@ -234,8 +252,52 @@ extern "C" { return parseAttrs(attrs,num_in,num_out); } - - + + /*! + * \brief returns status of calling infer shape function for operator from library + */ + int _opCallInferShape(inferShape_t inferShape, const char* const* keys, const char* const* vals, int num, + unsigned int** inshapes, int* indims, int num_in, + unsigned int*** outshapes, int** outdims, int num_out) { + //create map of attributes from list + std::map attrs; + for(int i=0; i > in_shapes(num_in); + for(int i=0; i > out_shapes(num_out); + + int retval = inferShape(attrs,in_shapes,out_shapes); + if(!retval) return retval; + + //allocate space for output dims, shape + *outdims = (int*)malloc(num_out*sizeof(int)); + *outshapes = (unsigned**)malloc(num_out*sizeof(unsigned*)); + + //copy output shapes + for(int i=0; i(lib, const_cast(MXLIB_OPCALLFREE_STR)); opCallParseAttrs_t callParseAttrs = get_func(lib, const_cast(MXLIB_OPCALLPARSEATTRS_STR)); + opCallInferShape_t callInferShape = get_func(lib, const_cast(MXLIB_OPCALLINFERSHAPE_STR)); opCallFComp_t callFComp = get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); //get number of operators registered in the library @@ -126,7 +128,7 @@ int MXLoadLib(const char *path) { //get operator from the library opRegGet(i,&name, &fcomp, &parse, &type, &shape); - //validate operator in the library + //validate operator functions from the library CHECK(fcomp != nullptr) << "Error loading '" << name << "' custom op, FCompute function was not set."; CHECK(parse != nullptr) << "Error loading '" << name << "' custom op, ParseAttrs function was not set."; CHECK(type != nullptr) << "Error loading '" << name << "' custom op, InferType function was not set."; @@ -135,6 +137,31 @@ int MXLoadLib(const char *path) { LOG(INFO) << "\tOp[" << i << "] " << name; std::string name_str(name); + /* + * Below are a series of lambda functions that will be registered in the NNVM op registration + * Each one has the standard MXNet signature and converts to types supported by externally + * registered operators. 
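+   * (for example, each lambda flattens attrs.dict into parallel arrays of C strings,
+   * since STL containers are not safe to pass across the MXNet/library boundary)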
+ */ + + //lambda function to call parse attributes + auto attr_parser = [=](const NodeAttrs* attrs) { + //convert attributes to vector of char + std::vector attr_keys, attr_vals; + for(auto kv : attrs->dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + + int num_in=-1; + int num_out=-1; + CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), + &num_in, &num_out)) + << "Error calling ParseAttrs for custom operator '" << name_str << "'"; + + //return type void + }; + + //lambda function to call parse attributes and return the number of inputs auto num_inputs = [=](const NodeAttrs& attrs) { //convert attributes to vector of char std::vector attr_keys, attr_vals; @@ -147,11 +174,12 @@ int MXLoadLib(const char *path) { int num_out=-1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) - << "Error calling ParseAttrs for custom operator '" << name_str << "'"; + << "Error calling ParseAttrs::num_inputs for custom operator '" << name_str << "'"; return num_in; }; + //lambda function to call parse attributes and return the number of outputs auto num_outputs = [=](const NodeAttrs& attrs) { //convert attributes to vector of char* std::vector attr_keys,attr_vals; @@ -164,11 +192,82 @@ int MXLoadLib(const char *path) { int num_out=-1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) - << "Error calling ParseAttrs for custom operator '" << name_str << "'"; + << "Error calling ParseAttrs::num_outputs for custom operator '" << name_str << "'"; return num_out; }; + //lambda function to call infer shape + auto infer_shape = [=] (const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_shape, + mxnet::ShapeVector *out_shape) { + //convert attributes to vector of char* + std::vector attr_keys, attr_vals; + for(auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + + std::vector inshapes(in_shape->size()); + std::vector indims(in_shape->size()); + + //determine amount of memory needed to store all the input shapes + size_t buff_size = 0; + for (const auto& i : *in_shape) buff_size += i.ndim(); + + //copy input shapes from ShapeVector to raw memory layout + std::vector inbuff(buff_size); + uint32_t *ptr = inbuff.data(); + for (size_t i = 0; i < in_shape->size(); ++i) { + inshapes[i] = ptr; + indims[i] = (*in_shape)[i].ndim(); + for (int j = 0; j < (*in_shape)[i].ndim(); ++j, ++ptr) { + *ptr = static_cast((*in_shape)[i][j]); + } + } + + //output shapes will be allocated by infer shape function + uint32_t** outshapes = nullptr; + int* outdims = nullptr; + + CHECK(callInferShape(shape, attr_keys.data(), attr_vals.data(), attr_keys.size(), + inshapes.data(), indims.data(), in_shape->size(), + &outshapes, &outdims, out_shape->size())) + << "Error calling InferShape for custom operator '" << name_str << "'"; + + std::vector out_shapes(out_shape->size()); + //determine amount of memory needed to store all the output shapes + buff_size = 0; + for (unsigned i=0; isize(); i++) { + buff_size += outdims[i]; + } + + //copy output shapes from custom op memory to MXNet memory + std::vector outbuff(buff_size); + ptr = outbuff.data(); + for (unsigned i = 0; i < out_shape->size(); ++i) { + out_shapes[i] = ptr; + for (int j = 0; j < outdims[i]; ++j, ++ptr) { + *ptr = static_cast(outshapes[i][j]); + } + } + + //assign output shapes to ShapeVector + for (unsigned i = 0; i < out_shape->size(); ++i) { + 
SHAPE_ASSIGN_CHECK(*out_shape, i, + mxnet::TShape(out_shapes[i], out_shapes[i]+outdims[i])); + } + + //free memory used by custom op to allocate shapes/dims + callFree(outdims); + for(unsigned i=0; isize(); i++) { + callFree(outshapes[i]); + } + callFree(outshapes); + + return true; + }; + // lambda function to convert from external fcompute to internal MXNet types auto fcomp_conv = [=](const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -187,7 +286,7 @@ int MXLoadLib(const char *path) { std::vector in_dims, out_dims; std::vector in_types, out_types; - //convert input tensors to constituant parts + //convert input tensors to constituent parts for(size_t i=0; i::Get()->__REGISTER_OR_GET__(contrib_name.c_str()); - regOp.set_attr("FCompute",fcomp_conv); + nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); + regOp.set_attr_parser(attr_parser); regOp.set_num_inputs(num_inputs); regOp.set_num_outputs(num_outputs); + regOp.set_attr("FInferShape", infer_shape); + regOp.add_argument("data", "NDArray[]", "Source inputs"); + regOp.set_attr("FCompute",fcomp_conv); } API_END(); From 9aecf8646b4047796c255cb4fc1fde42827fb61e Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 20 Aug 2019 07:47:47 +0000 Subject: [PATCH 008/111] fixed whitespace --- include/mxnet/lib_api.h | 101 ++++++++++++++++++---------------- src/c_api/c_api.cc | 118 ++++++++++++++++++++++------------------ 2 files changed, 119 insertions(+), 100 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index e19d67cccee4..1f17b0344586 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -25,9 +25,10 @@ #ifndef MXNET_LIB_API_H_ #define MXNET_LIB_API_H_ +#include #include #include -#include +#include /*! * \brief External Tensor data types @@ -55,10 +56,10 @@ struct MXTensor { */ template data_type* getData() { - return (data_type*)data; + return reinterpret_cast(data); } - void *data; // not owned + void *data; // not owned std::vector shape; MXDType dtype; }; @@ -66,13 +67,13 @@ struct MXTensor { /*! * Custom Operator function templates */ -typedef int (*fcomp_t)(std::map, +typedef int (*fcomp_t)(std::map, std::vector, std::vector); -typedef int (*parseAttrs_t)(std::map, +typedef int (*parseAttrs_t)(std::map, int*, int*); -typedef int (*inferType_t)(std::map, +typedef int (*inferType_t)(std::map, std::vector&, std::vector&); -typedef int (*inferShape_t)(std::map, +typedef int (*inferShape_t)(std::map, std::vector>&, std::vector>&); @@ -80,8 +81,8 @@ typedef int (*inferShape_t)(std::map, * \brief Class to hold custom operator registration */ class CustomOp { - public: - CustomOp(const char* op_name) : name(op_name), fcompute(nullptr), + public: + explicit CustomOp(const char* op_name) : name(op_name), fcompute(nullptr), parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr) {} ~CustomOp() {} CustomOp& setFCompute(fcomp_t fcomp) { @@ -139,6 +140,7 @@ class Registry { T& get(int idx) { return *(entries[idx]); } + private: /*! \brief constructor */ Registry() {} @@ -146,8 +148,7 @@ class Registry { ~Registry() {} /*! \brief map of entries in registry */ std::vector entries; -}; - +}; /* * Macros to help with string concat @@ -172,7 +173,8 @@ class Registry { /*! * \brief assign a var to a value */ -#define REGISTER_OP(Name) _STR_CONCAT(_REGISTER_DEF_(Name), __COUNTER__) = Registry::get()->add(TOSTRING(Name)) +#define REGISTER_OP(Name) _STR_CONCAT(_REGISTER_DEF_(Name), __COUNTER__) = \ + Registry::get()->add(TOSTRING(Name)) /*! @@ -242,52 +244,54 @@ extern "C" { /*! 
* \brief returns status of calling parse attributes function for operator from library */ - int _opCallParseAttrs(parseAttrs_t parseAttrs, const char* const* keys, const char* const* vals, int num, + int _opCallParseAttrs(parseAttrs_t parseAttrs, const char* const* keys, + const char* const* vals, int num, int* num_in, int* num_out) { - //create map of attributes from list - std::map attrs; - for(int i=0; i attrs; + for (int i = 0; i < num; i++) { attrs[std::string(keys[i])] = std::string(vals[i]); } - return parseAttrs(attrs,num_in,num_out); + return parseAttrs(attrs, num_in, num_out); } /*! * \brief returns status of calling infer shape function for operator from library */ - int _opCallInferShape(inferShape_t inferShape, const char* const* keys, const char* const* vals, int num, + int _opCallInferShape(inferShape_t inferShape, const char* const* keys, + const char* const* vals, int num, unsigned int** inshapes, int* indims, int num_in, unsigned int*** outshapes, int** outdims, int num_out) { - //create map of attributes from list - std::map attrs; - for(int i=0; i attrs; + for (int i = 0; i < num; i++) { attrs[std::string(keys[i])] = std::string(vals[i]); } - //create a vector of shapes for inputs + // create a vector of shapes for inputs std::vector > in_shapes(num_in); - for(int i=0; i > out_shapes(num_out); - int retval = inferShape(attrs,in_shapes,out_shapes); + int retval = inferShape(attrs, in_shapes, out_shapes); if(!retval) return retval; - //allocate space for output dims, shape - *outdims = (int*)malloc(num_out*sizeof(int)); - *outshapes = (unsigned**)malloc(num_out*sizeof(unsigned*)); + // allocate space for output dims, shape + *outdims = (int*) malloc (num_out * sizeof(int)); + *outshapes = (unsigned**) malloc (num_out * sizeof(unsigned*)); - //copy output shapes - for(int i=0; i attrs; - for(int i=0; i attrs; + for (int i = 0; i < num; i++) { attrs[std::string(keys[i])] = std::string(vals[i]); } - //create a vector of tensors for inputs + // create a vector of tensors for inputs std::vector inputs(num_in); - for(int i=0; i outputs(num_out); - for(int i=0; i(lib, const_cast(MXLIB_INITIALIZE_STR)); if (!initialize(static_cast(MXNET_VERSION))) LOG(FATAL) << "Library failed to initialize"; - //get call functions + // get call functions opCallFree_t callFree = get_func(lib, const_cast(MXLIB_OPCALLFREE_STR)); - opCallParseAttrs_t callParseAttrs = get_func(lib, const_cast(MXLIB_OPCALLPARSEATTRS_STR)); - opCallInferShape_t callInferShape = get_func(lib, const_cast(MXLIB_OPCALLINFERSHAPE_STR)); - opCallFComp_t callFComp = get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); - //get number of operators registered in the library + opCallParseAttrs_t callParseAttrs = + get_func(lib, const_cast(MXLIB_OPCALLPARSEATTRS_STR)); + + opCallInferShape_t callInferShape = + get_func(lib, const_cast(MXLIB_OPCALLINFERSHAPE_STR)); + + opCallFComp_t callFComp = + get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); + + // get number of operators registered in the library opRegSize_t opRegSize = get_func(lib, const_cast(MXLIB_OPREGSIZE_STR)); int numOps = opRegSize(); LOG(INFO) << "Found " << numOps << " operators in library"; - //loop and register each operator in the library + // loop and register each operator in the library opRegGet_t opRegGet = get_func(lib, const_cast(MXLIB_OPREGGET_STR)); - for(int i=0; i attr_keys, attr_vals; - for(auto kv : attrs->dict) { + for (auto kv : attrs->dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } - int num_in=-1; - int 
num_out=-1; + int num_in = -1; + int num_out = -1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs for custom operator '" << name_str << "'"; - //return type void + // return type void }; - //lambda function to call parse attributes and return the number of inputs + // lambda function to call parse attributes and return the number of inputs auto num_inputs = [=](const NodeAttrs& attrs) { - //convert attributes to vector of char + // convert attributes to vector of char std::vector attr_keys, attr_vals; - for(auto kv : attrs.dict) { + for (auto kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } - int num_in=-1; - int num_out=-1; + int num_in = -1; + int num_out = -1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs::num_inputs for custom operator '" << name_str << "'"; @@ -179,17 +189,17 @@ int MXLoadLib(const char *path) { return num_in; }; - //lambda function to call parse attributes and return the number of outputs + // lambda function to call parse attributes and return the number of outputs auto num_outputs = [=](const NodeAttrs& attrs) { - //convert attributes to vector of char* - std::vector attr_keys,attr_vals; - for(auto kv : attrs.dict) { + // convert attributes to vector of char* + std::vector attr_keys, attr_vals; + for (auto kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } - int num_in=-1; - int num_out=-1; + int num_in = -1; + int num_out = -1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs::num_outputs for custom operator '" << name_str << "'"; @@ -197,13 +207,13 @@ int MXLoadLib(const char *path) { return num_out; }; - //lambda function to call infer shape + // lambda function to call infer shape auto infer_shape = [=] (const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_shape, mxnet::ShapeVector *out_shape) { - //convert attributes to vector of char* + // convert attributes to vector of char* std::vector attr_keys, attr_vals; - for(auto kv : attrs.dict) { + for (auto kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } @@ -211,11 +221,11 @@ int MXLoadLib(const char *path) { std::vector inshapes(in_shape->size()); std::vector indims(in_shape->size()); - //determine amount of memory needed to store all the input shapes + // determine amount of memory needed to store all the input shapes size_t buff_size = 0; for (const auto& i : *in_shape) buff_size += i.ndim(); - //copy input shapes from ShapeVector to raw memory layout + // copy input shapes from ShapeVector to raw memory layout std::vector inbuff(buff_size); uint32_t *ptr = inbuff.data(); for (size_t i = 0; i < in_shape->size(); ++i) { @@ -236,13 +246,13 @@ int MXLoadLib(const char *path) { << "Error calling InferShape for custom operator '" << name_str << "'"; std::vector out_shapes(out_shape->size()); - //determine amount of memory needed to store all the output shapes + // determine amount of memory needed to store all the output shapes buff_size = 0; for (unsigned i=0; isize(); i++) { buff_size += outdims[i]; } - //copy output shapes from custom op memory to MXNet memory + // copy output shapes from custom op memory to MXNet memory std::vector outbuff(buff_size); ptr = outbuff.data(); for (unsigned i = 0; i < out_shape->size(); 
++i) { @@ -258,9 +268,9 @@ int MXLoadLib(const char *path) { mxnet::TShape(out_shapes[i], out_shapes[i]+outdims[i])); } - //free memory used by custom op to allocate shapes/dims + // free memory used by custom op to allocate shapes/dims callFree(outdims); - for(unsigned i=0; isize(); i++) { + for (unsigned i = 0; i < out_shape->size(); i++) { callFree(outshapes[i]); } callFree(outshapes); @@ -274,9 +284,9 @@ int MXLoadLib(const char *path) { const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - //convert attributes to vector of char* + // convert attributes to vector of char* std::vector attr_keys, attr_vals; - for(auto kv : attrs.dict) { + for (auto kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } @@ -286,32 +296,34 @@ int MXLoadLib(const char *path) { std::vector in_dims, out_dims; std::vector in_types, out_types; - //convert input tensors to constituent parts - for(size_t i=0; i::Get()->__REGISTER_OR_GET__(name); regOp.set_attr_parser(attr_parser); regOp.set_num_inputs(num_inputs); From 02deacfd61080d21b186226db0ac6329ce9de939 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 20 Aug 2019 08:00:02 +0000 Subject: [PATCH 009/111] fixed whitespace --- include/mxnet/lib_api.h | 16 ++++++++-------- src/c_api/c_api.cc | 32 ++++++++++++++++---------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 1f17b0344586..0d5fa849d431 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -81,7 +81,7 @@ typedef int (*inferShape_t)(std::map, * \brief Class to hold custom operator registration */ class CustomOp { - public: + public: explicit CustomOp(const char* op_name) : name(op_name), fcompute(nullptr), parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr) {} ~CustomOp() {} @@ -140,7 +140,7 @@ class Registry { T& get(int idx) { return *(entries[idx]); } - + private: /*! \brief constructor */ Registry() {} @@ -148,7 +148,7 @@ class Registry { ~Registry() {} /*! \brief map of entries in registry */ std::vector entries; -}; +}; /* * Macros to help with string concat @@ -281,16 +281,16 @@ extern "C" { std::vector > out_shapes(num_out); int retval = inferShape(attrs, in_shapes, out_shapes); - if(!retval) return retval; + if (!retval) return retval; // allocate space for output dims, shape - *outdims = (int*) malloc (num_out * sizeof(int)); - *outshapes = (unsigned**) malloc (num_out * sizeof(unsigned*)); + *outdims = static_cast(malloc (num_out * sizeof(int))); + *outshapes = static_cast(malloc (num_out * sizeof(unsigned*))); // copy output shapes for (int i = 0; i < num_out; i++) { (*outdims)[i] = out_shapes[i].size(); - (*outshapes)[i] = (unsigned*) malloc ((*outdims)[i] * sizeof(unsigned)); + (*outshapes)[i] = static_cast(malloc ((*outdims)[i] * sizeof(unsigned))); for (int j = 0; j < indims[i]; j++) { (*outshapes)[i][j] = out_shapes[i][j]; } @@ -336,7 +336,7 @@ extern "C" { return fcomp(attrs, inputs, outputs); } - + /*! * \brief Checks if the MXNet version is supported by the library. * If supported, initializes the library. 
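
At this point in the series the authoring surface is stable enough to summarize end to end. A library writer supplies just four functions plus a registration and an initialize hook; everything else (NNVM registration, attribute, shape, and type marshalling) is handled by the generic lambdas in c_api.cc. A minimal hypothetical library (illustrative sketch only: names invented, float32 assumed, error handling omitted) would look like:

    #include <map>
    #include <string>
    #include <vector>
    #include "lib_api.h"

    // hypothetical element-wise increment: out = in + 1 (float32 only)
    int incFCompute(std::map<std::string, std::string> attrs,
                    std::vector<MXTensor> inputs, std::vector<MXTensor> outputs) {
      float* in = inputs[0].getData<float>();
      float* out = outputs[0].getData<float>();
      int64_t size = 1;
      for (int64_t s : inputs[0].shape) size *= s;
      for (int64_t i = 0; i < size; i++) out[i] = in[i] + 1;
      return 1;  // non-zero signals success
    }

    int incParseAttrs(std::map<std::string, std::string> attrs,
                      int* num_in, int* num_out) {
      *num_in = 1;
      *num_out = 1;
      return 1;
    }

    int incInferType(std::map<std::string, std::string> attrs,
                     std::vector<int>& intypes, std::vector<int>& outtypes) {
      outtypes[0] = intypes[0];  // output dtype follows input dtype
      return 1;
    }

    int incInferShape(std::map<std::string, std::string> attrs,
                      std::vector<std::vector<unsigned int>>& inshapes,
                      std::vector<std::vector<unsigned int>>& outshapes) {
      outshapes[0] = inshapes[0];  // output shape follows input shape
      return 1;
    }

    REGISTER_OP(myinc)
    .setFCompute(incFCompute)
    .setParseAttrs(incParseAttrs)
    .setInferType(incInferType)
    .setInferShape(incInferShape);

    int initialize(int version) {
      return version >= 10400;  // require MXNet 1.4 or newer
    }
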
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 2dd7866135ec..6549acd184fa 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -107,13 +107,13 @@ int MXLoadLib(const char *path) { // get call functions opCallFree_t callFree = get_func(lib, const_cast(MXLIB_OPCALLFREE_STR)); - + opCallParseAttrs_t callParseAttrs = get_func(lib, const_cast(MXLIB_OPCALLPARSEATTRS_STR)); - + opCallInferShape_t callInferShape = get_func(lib, const_cast(MXLIB_OPCALLINFERSHAPE_STR)); - + opCallFComp_t callFComp = get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); @@ -152,7 +152,7 @@ int MXLoadLib(const char *path) { * Each one has the standard MXNet signature and converts to types supported by externally * registered operators. */ - + // lambda function to call parse attributes auto attr_parser = [=](const NodeAttrs* attrs) { // convert attributes to vector of char @@ -161,7 +161,7 @@ int MXLoadLib(const char *path) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } - + int num_in = -1; int num_out = -1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), @@ -185,7 +185,7 @@ int MXLoadLib(const char *path) { CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs::num_inputs for custom operator '" << name_str << "'"; - + return num_in; }; @@ -197,13 +197,13 @@ int MXLoadLib(const char *path) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } - + int num_in = -1; int num_out = -1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs::num_outputs for custom operator '" << name_str << "'"; - + return num_out; }; @@ -236,7 +236,7 @@ int MXLoadLib(const char *path) { } } - //output shapes will be allocated by infer shape function + // output shapes will be allocated by infer shape function uint32_t** outshapes = nullptr; int* outdims = nullptr; @@ -248,7 +248,7 @@ int MXLoadLib(const char *path) { std::vector out_shapes(out_shape->size()); // determine amount of memory needed to store all the output shapes buff_size = 0; - for (unsigned i=0; isize(); i++) { + for (unsigned i = 0; i < out_shape->size(); i++) { buff_size += outdims[i]; } @@ -262,7 +262,7 @@ int MXLoadLib(const char *path) { } } - //assign output shapes to ShapeVector + // assign output shapes to ShapeVector for (unsigned i = 0; i < out_shape->size(); ++i) { SHAPE_ASSIGN_CHECK(*out_shape, i, mxnet::TShape(out_shapes[i], out_shapes[i]+outdims[i])); @@ -274,10 +274,10 @@ int MXLoadLib(const char *path) { callFree(outshapes[i]); } callFree(outshapes); - + return true; }; - + // lambda function to convert from external fcompute to internal MXNet types auto fcomp_conv = [=](const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -290,7 +290,7 @@ int MXLoadLib(const char *path) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } - + std::vector in_data, out_data; std::vector in_shapes, out_shapes; std::vector in_dims, out_dims; @@ -330,9 +330,9 @@ int MXLoadLib(const char *path) { regOp.set_num_outputs(num_outputs); regOp.set_attr("FInferShape", infer_shape); regOp.add_argument("data", "NDArray[]", "Source inputs"); - regOp.set_attr("FCompute",fcomp_conv); + regOp.set_attr("FCompute", fcomp_conv); } - + API_END(); } From cf9350d8dd5bbf331d6e9b4c31cd0a9c89bb02e7 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 20 Aug 2019 08:04:35 +0000 Subject: [PATCH 010/111] 
fixed whitespace --- python/mxnet/library.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/mxnet/library.py b/python/mxnet/library.py index fcf39815d8e9..8ea0bc2ae0a5 100644 --- a/python/mxnet/library.py +++ b/python/mxnet/library.py @@ -59,12 +59,12 @@ def load(path): mx_nd = sys.modules["mxnet.ndarray"] mx_nd_op = sys.modules["mxnet.ndarray.op"] for op in dir(mx_nd_op): - func = getattr(mx_nd_op,op) - setattr(mx_nd,op,func) + func = getattr(mx_nd_op, op) + setattr(mx_nd, op, func) #re-register mx.sym.op into mx.sym mx_sym = sys.modules["mxnet.symbol"] mx_sym_op = sys.modules["mxnet.symbol.op"] for op in dir(mx_sym_op): - func = getattr(mx_sym_op,op) - setattr(mx_sym,op,func) + func = getattr(mx_sym_op, op) + setattr(mx_sym, op, func) From 38e77a582f97936f7c43f29072c6a94398f8580f Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Fri, 23 Aug 2019 19:10:53 +0000 Subject: [PATCH 011/111] added temporary support for operator multi-registration --- src/c_api/c_api.cc | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index ee3c0dbc90a8..cb044fce294c 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -323,14 +323,35 @@ int MXLoadLib(const char *path) { // return type void }; - // re-register op in MXNet using lambda converter functions - nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); - regOp.set_attr_parser(attr_parser); - regOp.set_num_inputs(num_inputs); - regOp.set_num_outputs(num_outputs); - regOp.set_attr("FInferShape", infer_shape); - regOp.add_argument("data", "NDArray[]", "Source inputs"); - regOp.set_attr("FCompute", fcomp_conv); + //check if operator is already registered + const nnvm::Op *regOpPtr = dmlc::Registry::Get()->Find(name); + if(regOpPtr == nullptr) { + // re-register op in MXNet using lambda converter functions + nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); + regOp.set_attr_parser(attr_parser); + regOp.set_num_inputs(num_inputs); + regOp.set_num_outputs(num_outputs); + + regOp.add_argument("data", "NDArray[]", "Source inputs"); + + regOp.set_attr("FInferShape", infer_shape); + regOp.set_attr("FCompute", fcomp_conv); + } else { + //overwrite registration of existing op with custom op + nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); + + regOp.set_attr_parser(attr_parser); + regOp.set_num_inputs(num_inputs); + regOp.set_num_outputs(num_outputs); + + regOp.arguments.clear(); + regOp.add_argument("data", "NDArray[]", "Source inputs"); + + //set attribute with higher plevel (11) to allow re-registering once + //TODO: enable constant overwriting of registertion multiple times + regOp.set_attr("FInferShape", infer_shape, 11); + regOp.set_attr("FCompute", fcomp_conv, 11); + } } API_END(); From 7b8f6a2fcd6b68671d20b7fa940c39f19f2bf4a2 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Fri, 23 Aug 2019 19:17:38 +0000 Subject: [PATCH 012/111] insanity checked --- src/c_api/c_api.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index cb044fce294c..7f54c30e34c2 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -325,7 +325,7 @@ int MXLoadLib(const char *path) { //check if operator is already registered const nnvm::Op *regOpPtr = dmlc::Registry::Get()->Find(name); - if(regOpPtr == nullptr) { + if (regOpPtr == nullptr) { // re-register op in MXNet using lambda converter functions nnvm::Op ®Op = 
dmlc::Registry::Get()->__REGISTER_OR_GET__(name); regOp.set_attr_parser(attr_parser); @@ -337,7 +337,7 @@ int MXLoadLib(const char *path) { regOp.set_attr("FInferShape", infer_shape); regOp.set_attr("FCompute", fcomp_conv); } else { - //overwrite registration of existing op with custom op + // overwrite registration of existing op with custom op nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); regOp.set_attr_parser(attr_parser); @@ -347,8 +347,8 @@ int MXLoadLib(const char *path) { regOp.arguments.clear(); regOp.add_argument("data", "NDArray[]", "Source inputs"); - //set attribute with higher plevel (11) to allow re-registering once - //TODO: enable constant overwriting of registertion multiple times + // set attribute with higher plevel (11) to allow re-registering once + // TODO(samskalicky): enable constant overwriting of registertion multiple times regOp.set_attr("FInferShape", infer_shape, 11); regOp.set_attr("FCompute", fcomp_conv, 11); } From 5b817bd5815e54676052b2b5a01eaa278460f731 Mon Sep 17 00:00:00 2001 From: rondogency Date: Fri, 23 Aug 2019 20:06:04 +0000 Subject: [PATCH 013/111] update docblocks --- example/lib_ops/libtest.cc | 2 +- example/lib_ops/mylib.cc | 5 +++-- example/lib_ops/test.py | 1 + include/mxnet/lib_api.h | 5 ++++- python/mxnet/__init__.py | 1 + src/c_api/c_api.cc | 10 ++++++++-- 6 files changed, 18 insertions(+), 6 deletions(-) diff --git a/example/lib_ops/libtest.cc b/example/lib_ops/libtest.cc index 8bdf36c05d37..9fcdda55c64f 100644 --- a/example/lib_ops/libtest.cc +++ b/example/lib_ops/libtest.cc @@ -18,7 +18,7 @@ */ /*! - * Copyright (c) 2015 by Contributors + * Copyright (c) 2019 by Contributors * \file libtest.cc * \brief This test checks if the library is implemented correctly * and does not involve dynamic loading of library into MXNet diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index 5916c9d46683..f00b138c66dc 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -18,9 +18,10 @@ */ /*! - * Copyright (c) 2015 by Contributors + * Copyright (c) 2019 by Contributors * \file mylib.cc - * \brief Sample library file + * \brief Sample custom operator implementation + * library file */ #include diff --git a/example/lib_ops/test.py b/example/lib_ops/test.py index cdf78bc5c9c3..b9d5aeb49340 100644 --- a/example/lib_ops/test.py +++ b/example/lib_ops/test.py @@ -21,6 +21,7 @@ # pylint: disable=arguments-differ # This test checks if dynamic loading of library into MXNet is successful +# and checks the end of end computation of custom operator import mxnet as mx import os diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 0d5fa849d431..c36868adf1f8 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -18,10 +18,13 @@ */ /*! - * Copyright (c) 2015 by Contributors + * Copyright (c) 2019 by Contributors * \file lib_api.h * \brief APIs to interact with libraries + * This API specifies function prototypes to + * register custom ops for library authors */ + #ifndef MXNET_LIB_API_H_ #define MXNET_LIB_API_H_ diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index c76d78f54331..87beb23a8d2b 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -86,6 +86,7 @@ from . import gluon +# Dynamic library module should be done after ndarray and symbol are initialized from . 
import library __version__ = base.__version__ diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 7f54c30e34c2..88f4fd0cdea2 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -93,7 +93,10 @@ inline int MXAPIGetFunctionRegInfo(const FunRegType *e, // NOTE: return value is added in API_END -// Loads library and initializes it +/*! + * \brief Loads dynamic library and initializes it + * \param path library path + */ int MXLoadLib(const char *path) { API_BEGIN(); void *lib = LibraryInitializer::Get()->lib_load(path); @@ -122,7 +125,10 @@ int MXLoadLib(const char *path) { int numOps = opRegSize(); LOG(INFO) << "Found " << numOps << " operators in library"; - // loop and register each operator in the library + /* + * The library has custom operators implementation + * loop and register each operator in the library to NNVM + */ opRegGet_t opRegGet = get_func(lib, const_cast(MXLIB_OPREGGET_STR)); for (int i = 0; i < numOps; i++) { const char* name; From 3bccfbe09832ac9b8276b93d4285fbf0d9f4385f Mon Sep 17 00:00:00 2001 From: rondogency Date: Fri, 23 Aug 2019 20:10:21 +0000 Subject: [PATCH 014/111] small format fix --- src/c_api/c_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 88f4fd0cdea2..198875736fc8 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -329,7 +329,7 @@ int MXLoadLib(const char *path) { // return type void }; - //check if operator is already registered + // check if operator is already registered const nnvm::Op *regOpPtr = dmlc::Registry::Get()->Find(name); if (regOpPtr == nullptr) { // re-register op in MXNet using lambda converter functions From a8c19c8424236dcb43ce4c653d8b1c332e8ab936 Mon Sep 17 00:00:00 2001 From: rondogency Date: Fri, 23 Aug 2019 23:00:24 +0000 Subject: [PATCH 015/111] fix unittest with correct library --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e3ccdac27943..90e64594589e 100644 --- a/Makefile +++ b/Makefile @@ -660,7 +660,7 @@ pylint: python3 -m pylint --rcfile=$(ROOTDIR)/ci/other/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" python/mxnet tools/caffe_converter/*.py sample_lib: - $(CXX) -shared -fPIC -std=c++11 example/lib_api/mylib.cc -o libsample_lib.so -I include/mxnet + $(CXX) -shared -fPIC -std=c++11 example/lib_ops/mylib.cc -o libsample_lib.so -I include/mxnet doc: docs From 2f34471b49e5c7e6cf0f43b02ff5144c4bae6559 Mon Sep 17 00:00:00 2001 From: rondogency Date: Tue, 27 Aug 2019 02:53:21 +0000 Subject: [PATCH 016/111] implement InferType --- example/lib_ops/mylib.cc | 27 +++++++++++++++++++++------ include/mxnet/lib_api.h | 40 +++++++++++++++++++++++++++++++++++++++- src/c_api/c_api.cc | 40 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 97 insertions(+), 10 deletions(-) diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index f00b138c66dc..4098e38aa06c 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -83,25 +83,39 @@ int parseAttrs(std::map attrs, int inferType(std::map attrs, std::vector &intypes, std::vector &outtypes) { + // validate inputs + if (intypes.size() != 2) { + std::cout << "Expected 2 inputs to inferType" << std::endl; + return 0; + } + + if (intypes[0] != intypes[1]) { + std::cout << "Expected 2 inputs to have same data type for inferType" << std::endl; + return 0; + } + outtypes[0] = intypes[0]; - + + std::cout << "intypes[0]=" << intypes[0] << " outtypes[0]=" << outtypes[0] << std::endl; + std::cout << "intypes=" << 
intypes.size() << " outtypes=" << outtypes.size() << std::endl; + return 1; //no error } int inferShape(std::map attrs, std::vector> &inshapes, std::vector> &outshapes) { - //validate inputs - if(inshapes.size() != 2) { + // validate inputs + if (inshapes.size() != 2) { std::cout << "Expected 2 inputs to inferShape" << std::endl; return 0; } - if(inshapes[0].size() != 2) { + if (inshapes[0].size() != 2) { std::cout << "Expected 2D for first input to inferShape" << std::endl; return 0; } - if(inshapes[1].size() != 2) { + if (inshapes[1].size() != 2) { std::cout << "Expected 2D for second input to inferShape" << std::endl; return 0; } @@ -114,7 +128,8 @@ int inferShape(std::map attrs, std::vector > out_shapes(num_out); int retval = inferShape(attrs, in_shapes, out_shapes); - if (!retval) return retval; + if (!retval) + return retval; // allocate space for output dims, shape *outdims = static_cast(malloc (num_out * sizeof(int))); @@ -302,6 +307,39 @@ extern "C" { return retval; } + /*! + * \brief returns status of calling InferType function for operator from library + */ + int _opCallInferType(inferType_t inferType, const char* const* keys, + const char* const* vals, int num, + int* intypes, int num_in, int* outtypes, int num_out) { + //create map of attributes from list + std::map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } + + // create a vector of types for inputs + std::vector in_types(num_in); + for (int i = 0; i < num_in; i++) { + in_types[i] = intypes[i]; + } + + // create a vector of types for outputs + std::vector out_types(num_out); + + int retval = inferType(attrs, in_types, out_types); + if (!retval) + return retval; + + // copy output types + for (int i = 0; i < num_out; i++) { + outtypes[i] = out_types[i]; + } + + return retval; + } + /*! 
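 * [Editorial example] _opCallInferType above is only the C trampoline; the
 * function it wraps stays plain C++ on the library side. A minimal sketch of
 * such an inferType for a hypothetical one-input, one-output op (names are
 * illustrative, not part of this patch; at this point in the series the
 * hooks still return a bare int, 1 for success and 0 for failure):
 *
 *   int myInferType(std::map<std::string, std::string> attrs,
 *                   std::vector<int> &intypes, std::vector<int> &outtypes) {
 *     if (intypes.size() != 1) return 0;  // wrong arity: fail
 *     outtypes[0] = intypes[0];           // output dtype follows the input
 *     return 1;
 *   }
 */
/*!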
* \brief returns status of calling FCompute function for operator from library */ diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 198875736fc8..ad776ad7d539 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -117,6 +117,9 @@ int MXLoadLib(const char *path) { opCallInferShape_t callInferShape = get_func(lib, const_cast(MXLIB_OPCALLINFERSHAPE_STR)); + opCallInferType_t callInferType = + get_func(lib, const_cast(MXLIB_OPCALLINFERTYPE_STR)); + opCallFComp_t callFComp = get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); @@ -137,10 +140,10 @@ int MXLoadLib(const char *path) { inferType_t type = nullptr; inferShape_t shape = nullptr; - // get operator from the library + // get custom operator implemenation from the dynamic library opRegGet(i, &name, &fcomp, &parse, &type, &shape); - // validate operator functions from the library + // validate custom operator functions from the dynamic library CHECK(fcomp != nullptr) << "Error loading '" << name << "' custom op, FCompute function was not set."; CHECK(parse != nullptr) << "Error loading '" << name @@ -284,6 +287,36 @@ int MXLoadLib(const char *path) { return true; }; + // lambda function to call infer type + auto infer_type = [=] (const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + // convert attributes to vector of char* + std::vector attr_keys, attr_vals; + for (auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + + // copy input types from in_type + std::vector intypes(*in_type); + + // output types will be populated by inferType function + std::vector outtypes(out_type->size()); + + CHECK(callInferType(type, attr_keys.data(), attr_vals.data(), attr_keys.size(), + intypes.data(), in_type->size(), + outtypes.data(), out_type->size())) + << "Error calling InferType for custom operator '" << name_str << "'"; + + // copy and assign output types from custom op to MXNet memory + for (size_t i = 0; i < out_type->size(); i++) { + TYPE_ASSIGN_CHECK(*out_type, i, outtypes[i]); + } + + return true; + }; + // lambda function to convert from external fcompute to internal MXNet types auto fcomp_conv = [=](const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -339,7 +372,7 @@ int MXLoadLib(const char *path) { regOp.set_num_outputs(num_outputs); regOp.add_argument("data", "NDArray[]", "Source inputs"); - + regOp.set_attr("FInferType", infer_type); regOp.set_attr("FInferShape", infer_shape); regOp.set_attr("FCompute", fcomp_conv); } else { @@ -355,6 +388,7 @@ int MXLoadLib(const char *path) { // set attribute with higher plevel (11) to allow re-registering once // TODO(samskalicky): enable constant overwriting of registertion multiple times + regOp.set_attr("FInferType", infer_type, 11); regOp.set_attr("FInferShape", infer_shape, 11); regOp.set_attr("FCompute", fcomp_conv, 11); } From 592249a518a096e38ca8d676c4bd2880cb679ee0 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 27 Aug 2019 18:46:21 +0000 Subject: [PATCH 017/111] initial support for resource manager, temp space --- example/lib_ops/mylib.cc | 5 +++-- include/mxnet/lib_api.h | 38 +++++++++++++++++++++++++++++++++----- src/c_api/c_api.cc | 27 +++++++++++++++++++++++++-- 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index 4098e38aa06c..fea5b84e30a6 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -44,7 +44,8 @@ void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { int 
myFCompute(std::map attrs, - std::vector inputs, std::vector outputs) { + std::vector inputs, std::vector outputs, + OpResource res) { //validate inputs for(int i=0; i attrs, return 0; } } - + //extract data pointers from tensors float* input1 = inputs[0].getData(); float* input2 = inputs[1].getData(); diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 24ec22fc42fe..b05f3ab936d1 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -67,11 +67,35 @@ struct MXTensor { MXDType dtype; }; +/*! + * \brief resource malloc function to allocate memory inside fcompute function + */ +typedef void* (*xpu_malloc_t)(void*,int); + +/*! + * \brief Class to provide resource APIs to FCompute + */ +class OpResource { + public: + OpResource(xpu_malloc_t xm, void* _xm) : xpu_malloc(xm), _xpu_malloc(_xm) {} + + /*! + * \brief allocate memory controlled by MXNet + */ + void* alloc(int size) { + return xpu_malloc(_xpu_malloc,size); + } + private: + xpu_malloc_t xpu_malloc; + void* _xpu_malloc; +}; + /*! * Custom Operator function templates */ typedef int (*fcomp_t)(std::map, - std::vector, std::vector); + std::vector, std::vector, + OpResource res); typedef int (*parseAttrs_t)(std::map, int*, int*); typedef int (*inferType_t)(std::map, @@ -214,7 +238,8 @@ typedef int (*opCallInferType_t)(inferType_t, const char* const*, const char* co #define MXLIB_OPCALLFCOMP_STR "_opCallFCompute" typedef int (*opCallFComp_t)(fcomp_t, const char* const*, const char* const*, int, const int64_t**, int*, void**, int*, int, - const int64_t**, int*, void**, int*, int); + const int64_t**, int*, void**, int*, int, + xpu_malloc_t, void*); #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int); @@ -247,7 +272,7 @@ extern "C" { void _opCallFree(void* ptr) { free(ptr); } - + /*! * \brief returns status of calling parse attributes function for operator from library */ @@ -348,7 +373,8 @@ extern "C" { const int64_t** inshapes, int* indims, void** indata, int* intypes, int num_in, const int64_t** outshapes, int* outdims, - void** outdata, int* outtypes, int num_out) { + void** outdata, int* outtypes, int num_out, + xpu_malloc_t xpu_malloc, void* _xpu_malloc) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -375,7 +401,9 @@ extern "C" { } } - return fcomp(attrs, inputs, outputs); + OpResource res(xpu_malloc,_xpu_malloc); + + return fcomp(attrs, inputs, outputs, res); } /*! 
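 * [Editorial example] The OpResource/xpu_malloc pair added above lets a
 * kernel request MXNet-managed scratch space instead of calling malloc
 * itself. A usage sketch inside a compute function (illustrative, not part
 * of the patch):
 *
 *   int myFCompute(std::map<std::string, std::string> attrs,
 *                  std::vector<MXTensor> inputs, std::vector<MXTensor> outputs,
 *                  OpResource res) {
 *     void* scratch = res.alloc(1024);  // 1024 bytes of MXNet-owned temp space
 *     // ... use scratch; MXNet reclaims it, the library never frees it ...
 *     return 1;
 *   }
 */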
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index ad776ad7d539..79be8ece479f 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -351,12 +351,32 @@ int MXLoadLib(const char *path) { out_types.push_back(outputs[i].type_flag_); } + // get memory resource + const Resource &resource = ctx.requested[0]; + mshadow::Stream *cpu_stream = ctx.get_stream(); + + // create lambda that captures stream & resource objects + auto cpu_alloc = [&](int size) { + mshadow::Tensor data = resource.get_space_typed(mshadow::Shape1(size),cpu_stream); + return data.dptr_; + }; + + // create lambda without captures so that we can cast it to function pointer + // this needs to be a lambda function so that we can do the decltype cast + auto cpu_malloc = [](void* _cpu_alloc, int size) { + // cast the void* argument to the type for the cpu_alloc lambda function + decltype(cpu_alloc)* cpualloc = static_cast(_cpu_alloc); + + void* ptr = (*cpualloc)(size); + return ptr; + }; + // call fcompute function CHECK(callFComp(fcomp, attr_keys.data(), attr_vals.data(), attr_keys.size(), in_shapes.data(), in_dims.data(), in_data.data(), in_types.data(), in_data.size(), out_shapes.data(), out_dims.data(), out_data.data(), - out_types.data(), out_data.size())) + out_types.data(), out_data.size(), cpu_malloc, &cpu_alloc)) << "Error calling FCompute for custom operator '" << name_str << "'"; // return type void @@ -370,7 +390,10 @@ int MXLoadLib(const char *path) { regOp.set_attr_parser(attr_parser); regOp.set_num_inputs(num_inputs); regOp.set_num_outputs(num_outputs); - + regOp.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }); regOp.add_argument("data", "NDArray[]", "Source inputs"); regOp.set_attr("FInferType", infer_type); regOp.set_attr("FInferShape", infer_shape); From 3186d6041276718d81927fc70092a5b17813f164 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 27 Aug 2019 20:13:14 +0000 Subject: [PATCH 018/111] fixed formatting --- include/mxnet/lib_api.h | 12 ++++++------ src/c_api/c_api.cc | 8 +++++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index b05f3ab936d1..a0ddd56f03fd 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -70,7 +70,7 @@ struct MXTensor { /*! * \brief resource malloc function to allocate memory inside fcompute function */ -typedef void* (*xpu_malloc_t)(void*,int); +typedef void* (*xpu_malloc_t)(void*, int); /*! * \brief Class to provide resource APIs to FCompute @@ -83,7 +83,7 @@ class OpResource { * \brief allocate memory controlled by MXNet */ void* alloc(int size) { - return xpu_malloc(_xpu_malloc,size); + return xpu_malloc(_xpu_malloc, size); } private: xpu_malloc_t xpu_malloc; @@ -272,7 +272,7 @@ extern "C" { void _opCallFree(void* ptr) { free(ptr); } - + /*! 
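 * [Editorial note] The cpu_malloc/cpu_alloc pair in the c_api.cc hunk above
 * is the standard C-callback trampoline: a capturing lambda cannot decay to
 * a function pointer, so a capture-free lambda crosses the C boundary while
 * the capturing one rides behind a void*. Condensed to its essentials:
 *
 *   auto cpu_alloc = [&](int size) { ... };            // captures stream/resource
 *   auto cpu_malloc = [](void* _cpu_alloc, int size) { // capture-free: decays
 *     decltype(cpu_alloc)* cpualloc =                  // to an xpu_malloc_t
 *         static_cast<decltype(cpu_alloc)*>(_cpu_alloc);
 *     return (*cpualloc)(size);
 *   };
 *   callFComp(..., cpu_malloc, &cpu_alloc);
 */
/*!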
* \brief returns status of calling parse attributes function for operator from library */ @@ -338,7 +338,7 @@ extern "C" { int _opCallInferType(inferType_t inferType, const char* const* keys, const char* const* vals, int num, int* intypes, int num_in, int* outtypes, int num_out) { - //create map of attributes from list + // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { attrs[std::string(keys[i])] = std::string(vals[i]); @@ -401,8 +401,8 @@ extern "C" { } } - OpResource res(xpu_malloc,_xpu_malloc); - + OpResource res(xpu_malloc, _xpu_malloc); + return fcomp(attrs, inputs, outputs, res); } diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 79be8ece479f..7a74673f1d67 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -357,7 +357,8 @@ int MXLoadLib(const char *path) { // create lambda that captures stream & resource objects auto cpu_alloc = [&](int size) { - mshadow::Tensor data = resource.get_space_typed(mshadow::Shape1(size),cpu_stream); + mshadow::Tensor data = + resource.get_space_typed(mshadow::Shape1(size), cpu_stream); return data.dptr_; }; @@ -366,7 +367,7 @@ int MXLoadLib(const char *path) { auto cpu_malloc = [](void* _cpu_alloc, int size) { // cast the void* argument to the type for the cpu_alloc lambda function decltype(cpu_alloc)* cpualloc = static_cast(_cpu_alloc); - + void* ptr = (*cpualloc)(size); return ptr; }; @@ -392,7 +393,8 @@ int MXLoadLib(const char *path) { regOp.set_num_outputs(num_outputs); regOp.set_attr("FResourceRequest", [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; + return std::vector{ + ResourceRequest::kTempSpace}; }); regOp.add_argument("data", "NDArray[]", "Source inputs"); regOp.set_attr("FInferType", infer_type); From bf549b4b68e2d709b5ae7aa8487e032bea1bb482 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 29 Aug 2019 05:56:35 +0000 Subject: [PATCH 019/111] changed decltype to typedef --- src/c_api/c_api.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 382f9b47a84f..56081aa5442e 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -362,11 +362,13 @@ int MXLoadLib(const char *path) { return data.dptr_; }; + typedef decltype(cpu_alloc) alloc_type; + // create lambda without captures so that we can cast it to function pointer // this needs to be a lambda function so that we can do the decltype cast auto cpu_malloc = [](void* _cpu_alloc, int size) { // cast the void* argument to the type for the cpu_alloc lambda function - decltype(cpu_alloc)* cpualloc = static_cast(_cpu_alloc); + alloc_type* cpualloc = static_cast(_cpu_alloc); void* ptr = (*cpualloc)(size); return ptr; From 439ee201f734af241e5ec22f83569570bd287914 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 29 Aug 2019 06:01:20 +0000 Subject: [PATCH 020/111] fixed whitespace --- src/c_api/c_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 56081aa5442e..69d9ba114b0b 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -363,7 +363,7 @@ int MXLoadLib(const char *path) { }; typedef decltype(cpu_alloc) alloc_type; - + // create lambda without captures so that we can cast it to function pointer // this needs to be a lambda function so that we can do the decltype cast auto cpu_malloc = [](void* _cpu_alloc, int size) { From bba25db59e840183b69ea4688c04a8e9335516bd Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 29 Aug 2019 07:39:17 +0000 Subject: [PATCH 
021/111] Added windows declaration types, change APIs to return MXReturnValue instead of int --- example/lib_ops/mylib.cc | 36 +++++++++---------- include/mxnet/lib_api.h | 77 ++++++++++++++++++++++++++++++---------- 2 files changed, 77 insertions(+), 36 deletions(-) diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index fea5b84e30a6..ac2bd974a94d 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -43,14 +43,14 @@ void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { } -int myFCompute(std::map attrs, +MXReturnValue myFCompute(std::map attrs, std::vector inputs, std::vector outputs, OpResource res) { //validate inputs for(int i=0; i attrs, gemm(input1, input2, output, n, k, m); - return 1; //no error + return MX_SUCCESS; } -int parseAttrs(std::map attrs, +MXReturnValue parseAttrs(std::map attrs, int* num_in, int* num_out) { /* if(attrs.find("myParam") == attrs.end()) { @@ -79,20 +79,20 @@ int parseAttrs(std::map attrs, *num_in = 2; *num_out = 1; - return 1; //no error + return MX_SUCCESS; } -int inferType(std::map attrs, std::vector &intypes, +MXReturnValue inferType(std::map attrs, std::vector &intypes, std::vector &outtypes) { // validate inputs if (intypes.size() != 2) { std::cout << "Expected 2 inputs to inferType" << std::endl; - return 0; + return MX_FAIL; } if (intypes[0] != intypes[1]) { std::cout << "Expected 2 inputs to have same data type for inferType" << std::endl; - return 0; + return MX_FAIL; } outtypes[0] = intypes[0]; @@ -100,25 +100,25 @@ int inferType(std::map attrs, std::vector &intypes std::cout << "intypes[0]=" << intypes[0] << " outtypes[0]=" << outtypes[0] << std::endl; std::cout << "intypes=" << intypes.size() << " outtypes=" << outtypes.size() << std::endl; - return 1; //no error + return MX_SUCCESS; } -int inferShape(std::map attrs, std::vector> &inshapes, +MXReturnValue inferShape(std::map attrs, std::vector> &inshapes, std::vector> &outshapes) { // validate inputs if (inshapes.size() != 2) { std::cout << "Expected 2 inputs to inferShape" << std::endl; - return 0; + return MX_FAIL; } if (inshapes[0].size() != 2) { std::cout << "Expected 2D for first input to inferShape" << std::endl; - return 0; + return MX_FAIL; } if (inshapes[1].size() != 2) { std::cout << "Expected 2D for second input to inferShape" << std::endl; - return 0; + return MX_FAIL; } unsigned n = inshapes[0][0]; @@ -130,12 +130,12 @@ int inferShape(std::map attrs, std::vector, - std::vector, std::vector, - OpResource res); -typedef int (*parseAttrs_t)(std::map, - int*, int*); -typedef int (*inferType_t)(std::map, - std::vector&, std::vector&); -typedef int (*inferShape_t)(std::map, - std::vector>&, - std::vector>&); +typedef MXReturnValue (*fcomp_t)(std::map, + std::vector, std::vector, + OpResource res); +typedef MXReturnValue (*parseAttrs_t)(std::map, + int*, int*); +typedef MXReturnValue (*inferType_t)(std::map, + std::vector&, std::vector&); +typedef MXReturnValue (*inferShape_t)(std::map, + std::vector>&, + std::vector>&); /*! * \brief Class to hold custom operator registration @@ -248,14 +253,24 @@ extern "C" { /*! * \brief returns number of ops registered in this library */ - int _opRegSize() { +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl _opRegSize +#else + int _opRegSize +#endif + () { return Registry::get()->size(); } /*! 
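 * [Editorial note] With the __declspec(dllexport)/__cdecl annotations added
 * in this patch, every hook resolves by name under both loaders. A sketch of
 * the lookup side (illustrative; MXNet wraps this in its get_func helper):
 *
 *   #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
 *     opRegSize_t opRegSize = (opRegSize_t) GetProcAddress(handle, MXLIB_OPREGSIZE_STR);
 *   #else
 *     opRegSize_t opRegSize = (opRegSize_t) dlsym(handle, MXLIB_OPREGSIZE_STR);
 *   #endif
 *   int numOps = opRegSize();  // number of ops the library registered
 */
/*!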
* \brief returns operator registration at specified index */ - void _opRegGet(int idx, const char** name, fcomp_t* fcomp, +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) void __cdecl _opRegGet +#else + void _opRegGet +#endif + (int idx, const char** name, fcomp_t* fcomp, parseAttrs_t* parse, inferType_t* type, inferShape_t* shape) { CustomOp op = Registry::get()->get(idx); @@ -269,14 +284,24 @@ extern "C" { /*! * \brief calls free from the external library for library allocated arrays */ - void _opCallFree(void* ptr) { + #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) void __cdecl _opCallFree +#else + void _opCallFree +#endif + (void* ptr) { free(ptr); } /*! * \brief returns status of calling parse attributes function for operator from library */ - int _opCallParseAttrs(parseAttrs_t parseAttrs, const char* const* keys, +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl _opCallParseAttrs +#else + int _opCallParseAttrs +#endif + (parseAttrs_t parseAttrs, const char* const* keys, const char* const* vals, int num, int* num_in, int* num_out) { // create map of attributes from list @@ -291,7 +316,12 @@ extern "C" { /*! * \brief returns status of calling infer shape function for operator from library */ - int _opCallInferShape(inferShape_t inferShape, const char* const* keys, +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl _opCallInferShape +#else + int _opCallInferShape +#endif + (inferShape_t inferShape, const char* const* keys, const char* const* vals, int num, unsigned int** inshapes, int* indims, int num_in, unsigned int*** outshapes, int** outdims, int num_out) { @@ -335,7 +365,12 @@ extern "C" { /*! * \brief returns status of calling InferType function for operator from library */ - int _opCallInferType(inferType_t inferType, const char* const* keys, +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl _opCallInferType +#else + int _opCallInferType +#endif + (inferType_t inferType, const char* const* keys, const char* const* vals, int num, int* intypes, int num_in, int* outtypes, int num_out) { // create map of attributes from list @@ -368,7 +403,13 @@ extern "C" { /*! 
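 * [Editorial note] Ownership across this boundary: buffers the library
 * mallocs for MXNet (e.g. outshapes/outdims in _opCallInferShape above) must
 * be released on the library's heap, which is what _opCallFree exists for.
 * Consumer-side sketch (illustrative):
 *
 *   unsigned int** outshapes = nullptr;
 *   int* outdims = nullptr;
 *   callInferShape(shape, keys, vals, num, inshapes, indims, num_in,
 *                  &outshapes, &outdims, num_out);
 *   // ...copy into MXNet shape objects, then free on the library side...
 *   for (int i = 0; i < num_out; i++) callFree(outshapes[i]);
 *   callFree(outshapes);
 *   callFree(outdims);
 */
/*!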
* \brief returns status of calling FCompute function for operator from library */ - int _opCallFCompute(fcomp_t fcomp, const char* const* keys, + +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl _opCallFCompute +#else + int _opCallFCompute +#endif + (fcomp_t fcomp, const char* const* keys, const char* const* vals, int num, const int64_t** inshapes, int* indims, void** indata, int* intypes, int num_in, @@ -416,7 +457,7 @@ extern "C" { #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) int __cdecl initialize(int); #else - int initialize(int); + MXReturnValue initialize(int); #endif } #endif // MXNET_LIB_API_H_ From a681f6181141214203cb05b2cca90d4809a70785 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 29 Aug 2019 07:59:52 +0000 Subject: [PATCH 022/111] added library version number, API to get, and check to validate --- include/mxnet/lib_api.h | 94 ++++++++++++++++++++++++----------------- src/c_api/c_api.cc | 7 +++ 2 files changed, 63 insertions(+), 38 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index c8fa635fa218..4801282ee13b 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -33,6 +33,8 @@ #include #include +#define MX_LIBRARY_VERSION 1 + /*! * \brief External Tensor data types */ @@ -249,16 +251,31 @@ typedef int (*opCallFComp_t)(fcomp_t, const char* const*, const char* const*, in #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int); +#define MXLIB_OPVERSION_STR "_opVersion" +typedef int (*opVersion_t)(); + extern "C" { + /*! + * \brief returns MXNet library version + */ + #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl +#else + int +#endif + _opVersion () { + return MX_LIBRARY_VERSION; + } + /*! 
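 * [Editorial note] The handshake added here is two-sided: MXNet first calls
 * _opVersion(), which was compiled into the library from this header, and
 * aborts the load on a mismatch; only then does it call the author-defined
 * initialize() with its own MXNET_VERSION. Sketch of the MXNet-side check
 * (condensed from the c_api.cc hunk later in this patch):
 *
 *   opVersion_t opVersion = get_func<opVersion_t>(lib, MXLIB_OPVERSION_STR);
 *   if (opVersion() != MX_LIBRARY_VERSION)
 *     LOG(FATAL) << "Library version mismatch";
 */
/*!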
* \brief returns number of ops registered in this library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) int __cdecl _opRegSize + __declspec(dllexport) int __cdecl #else - int _opRegSize + int #endif - () { + _opRegSize () { return Registry::get()->size(); } @@ -266,13 +283,13 @@ extern "C" { * \brief returns operator registration at specified index */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) void __cdecl _opRegGet + __declspec(dllexport) void __cdecl #else - void _opRegGet + void #endif - (int idx, const char** name, fcomp_t* fcomp, - parseAttrs_t* parse, inferType_t* type, - inferShape_t* shape) { + _opRegGet (int idx, const char** name, fcomp_t* fcomp, + parseAttrs_t* parse, inferType_t* type, + inferShape_t* shape) { CustomOp op = Registry::get()->get(idx); *name = op.name; *fcomp = op.fcompute; @@ -285,11 +302,11 @@ extern "C" { * \brief calls free from the external library for library allocated arrays */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) void __cdecl _opCallFree + __declspec(dllexport) void __cdecl #else - void _opCallFree + void #endif - (void* ptr) { + _opCallFree (void* ptr) { free(ptr); } @@ -297,13 +314,13 @@ extern "C" { * \brief returns status of calling parse attributes function for operator from library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) int __cdecl _opCallParseAttrs + __declspec(dllexport) int __cdecl #else - int _opCallParseAttrs + int #endif - (parseAttrs_t parseAttrs, const char* const* keys, - const char* const* vals, int num, - int* num_in, int* num_out) { + _opCallParseAttrs (parseAttrs_t parseAttrs, const char* const* keys, + const char* const* vals, int num, + int* num_in, int* num_out) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -317,14 +334,14 @@ extern "C" { * \brief returns status of calling infer shape function for operator from library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) int __cdecl _opCallInferShape + __declspec(dllexport) int __cdecl #else - int _opCallInferShape + int #endif - (inferShape_t inferShape, const char* const* keys, - const char* const* vals, int num, - unsigned int** inshapes, int* indims, int num_in, - unsigned int*** outshapes, int** outdims, int num_out) { + _opCallInferShape (inferShape_t inferShape, const char* const* keys, + const char* const* vals, int num, + unsigned int** inshapes, int* indims, int num_in, + unsigned int*** outshapes, int** outdims, int num_out) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -366,13 +383,13 @@ extern "C" { * \brief returns status of calling InferType function for operator from library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) int __cdecl _opCallInferType + __declspec(dllexport) int __cdecl #else - int _opCallInferType + int #endif - (inferType_t inferType, const char* const* keys, - const char* const* vals, int num, - int* intypes, int num_in, int* outtypes, int num_out) { + _opCallInferType (inferType_t inferType, const char* const* keys, + const char* const* vals, int num, + int* intypes, int num_in, int* outtypes, int num_out) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -405,17 +422,17 @@ extern "C" { */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - 
__declspec(dllexport) int __cdecl _opCallFCompute + __declspec(dllexport) int __cdecl #else - int _opCallFCompute + int #endif - (fcomp_t fcomp, const char* const* keys, - const char* const* vals, int num, - const int64_t** inshapes, int* indims, - void** indata, int* intypes, int num_in, - const int64_t** outshapes, int* outdims, - void** outdata, int* outtypes, int num_out, - xpu_malloc_t xpu_malloc, void* _xpu_malloc) { + _opCallFCompute (fcomp_t fcomp, const char* const* keys, + const char* const* vals, int num, + const int64_t** inshapes, int* indims, + void** indata, int* intypes, int num_in, + const int64_t** outshapes, int* outdims, + void** outdata, int* outtypes, int num_out, + xpu_malloc_t xpu_malloc, void* _xpu_malloc) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -455,9 +472,10 @@ extern "C" { * \return Non-zero value on error i.e. library incompatible with passed MXNet version */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) int __cdecl initialize(int); + __declspec(dllexport) MXReturnValue __cdecl #else - MXReturnValue initialize(int); + MXReturnValue #endif + initialize(int version); } #endif // MXNET_LIB_API_H_ diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 69d9ba114b0b..2bacee2663ee 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -103,6 +103,13 @@ int MXLoadLib(const char *path) { if (!lib) LOG(FATAL) << "Unable to load library"; + // check that library and MXNet use same version of library API + opVersion_t opVersion = get_func(lib, const_cast(MXLIB_OPVERSION_STR)); + int libVersion = opVersion(); + if (MX_LIBRARY_VERSION != libVersion) + LOG(FATAL) << "Library version (" << libVersion << ") does not match MXNet version (" + << MX_LIBRARY_VERSION << ")"; + // initialize library by passing MXNet version initialize_t initialize = get_func(lib, const_cast(MXLIB_INITIALIZE_STR)); if (!initialize(static_cast(MXNET_VERSION))) From 711f9a364d463018395f84059edb8e2b3eabb052 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 29 Aug 2019 08:15:31 +0000 Subject: [PATCH 023/111] Changed CMakeLists to build lib_ops instead of lib_api, updated lib_api example, fixed whitespace --- CMakeLists.txt | 2 +- example/lib_api/Makefile | 2 +- example/lib_api/mylib.cc | 6 ++--- include/mxnet/lib_api.h | 48 ++++++++++++++++++++-------------------- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 976c736f5f35..564e40c1083a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -693,7 +693,7 @@ else() endif() -add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/lib_api/mylib.cc) +add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/lib_ops/mylib.cc) target_include_directories(sample_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) set(MXNET_INSTALL_TARGETS mxnet) if(UNIX) diff --git a/example/lib_api/Makefile b/example/lib_api/Makefile index e5893c8065c4..a811f2250b3e 100644 --- a/example/lib_api/Makefile +++ b/example/lib_api/Makefile @@ -16,7 +16,7 @@ # under the License. 
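# [Editorial note] The -std=c++11 added below is load-bearing: lib_api.h now
# uses C++11 features (typed enums and the lambda/typedef trampoline), so
# example libraries must be compiled as C++11 or newer, e.g.
#   g++ -std=c++11 -shared -fPIC mylib.cc -o mylib.so -I ../../include/mxnet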
all: - g++ -shared -fPIC mylib.cc -o mylib.so -I ../../include/mxnet + g++ -std=c++11 -shared -fPIC mylib.cc -o mylib.so -I ../../include/mxnet test: g++ -std=c++11 -O3 -o libtest libtest.cc -ldl -I ../../include/mxnet diff --git a/example/lib_api/mylib.cc b/example/lib_api/mylib.cc index e67560a87f3d..048642332f16 100644 --- a/example/lib_api/mylib.cc +++ b/example/lib_api/mylib.cc @@ -26,12 +26,12 @@ #include #include "lib_api.h" -int initialize(int version) { +MXReturnValue initialize(int version) { if (version >= 10400) { std::cout << "MXNet version " << version << " supported" << std::endl; - return 1; + return MX_SUCCESS; } else { std::cout << "MXNet version " << version << " not supported" << std::endl; - return 0; + return MX_FAIL; } } diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 4801282ee13b..f0256cc616c8 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -263,7 +263,7 @@ extern "C" { #else int #endif - _opVersion () { + _opVersion() { return MX_LIBRARY_VERSION; } @@ -275,7 +275,7 @@ extern "C" { #else int #endif - _opRegSize () { + _opRegSize() { return Registry::get()->size(); } @@ -287,9 +287,9 @@ extern "C" { #else void #endif - _opRegGet (int idx, const char** name, fcomp_t* fcomp, - parseAttrs_t* parse, inferType_t* type, - inferShape_t* shape) { + _opRegGet(int idx, const char** name, fcomp_t* fcomp, + parseAttrs_t* parse, inferType_t* type, + inferShape_t* shape) { CustomOp op = Registry::get()->get(idx); *name = op.name; *fcomp = op.fcompute; @@ -306,7 +306,7 @@ extern "C" { #else void #endif - _opCallFree (void* ptr) { + _opCallFree(void* ptr) { free(ptr); } @@ -318,9 +318,9 @@ extern "C" { #else int #endif - _opCallParseAttrs (parseAttrs_t parseAttrs, const char* const* keys, - const char* const* vals, int num, - int* num_in, int* num_out) { + _opCallParseAttrs(parseAttrs_t parseAttrs, const char* const* keys, + const char* const* vals, int num, + int* num_in, int* num_out) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -338,10 +338,10 @@ extern "C" { #else int #endif - _opCallInferShape (inferShape_t inferShape, const char* const* keys, - const char* const* vals, int num, - unsigned int** inshapes, int* indims, int num_in, - unsigned int*** outshapes, int** outdims, int num_out) { + _opCallInferShape(inferShape_t inferShape, const char* const* keys, + const char* const* vals, int num, + unsigned int** inshapes, int* indims, int num_in, + unsigned int*** outshapes, int** outdims, int num_out) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -387,9 +387,9 @@ extern "C" { #else int #endif - _opCallInferType (inferType_t inferType, const char* const* keys, - const char* const* vals, int num, - int* intypes, int num_in, int* outtypes, int num_out) { + _opCallInferType(inferType_t inferType, const char* const* keys, + const char* const* vals, int num, + int* intypes, int num_in, int* outtypes, int num_out) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -403,7 +403,7 @@ extern "C" { } // create a vector of types for outputs - std::vector out_types(num_out); + std::vector out_types(num_out, -1); int retval = inferType(attrs, in_types, out_types); if (!retval) @@ -426,13 +426,13 @@ extern "C" { #else int #endif - _opCallFCompute (fcomp_t fcomp, const char* const* keys, - const char* const* vals, int num, - const int64_t** inshapes, int* indims, - void** indata, int* intypes, int num_in, - const int64_t** 
outshapes, int* outdims, - void** outdata, int* outtypes, int num_out, - xpu_malloc_t xpu_malloc, void* _xpu_malloc) { + _opCallFCompute(fcomp_t fcomp, const char* const* keys, + const char* const* vals, int num, + const int64_t** inshapes, int* indims, + void** indata, int* intypes, int num_in, + const int64_t** outshapes, int* outdims, + void** outdata, int* outtypes, int num_out, + xpu_malloc_t xpu_malloc, void* _xpu_malloc) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { From 5af1736ad669c291cecf32326f51d82142fbb7e8 Mon Sep 17 00:00:00 2001 From: rondogency Date: Thu, 29 Aug 2019 22:26:35 +0000 Subject: [PATCH 024/111] add prototype of subgraph op --- example/lib_ops/Makefile | 7 +++- example/lib_ops/mylib.cc | 8 +++- example/lib_ops/subgraph_lib.cc | 72 ++++++++++++++++++++++++++++++++ example/lib_ops/test.py | 11 ++++- example/lib_ops/test_subgraph.py | 50 ++++++++++++++++++++++ 5 files changed, 145 insertions(+), 3 deletions(-) create mode 100644 example/lib_ops/subgraph_lib.cc create mode 100644 example/lib_ops/test_subgraph.py diff --git a/example/lib_ops/Makefile b/example/lib_ops/Makefile index f649a68eee9a..628f09aadc4c 100644 --- a/example/lib_ops/Makefile +++ b/example/lib_ops/Makefile @@ -15,9 +15,14 @@ # specific language governing permissions and limitations # under the License. -all: +all: warpctc_lib subgraph_lib + +warpctc_lib: g++ -shared -fPIC -std=gnu++0x mylib.cc -o mylib.so -I ../../include/mxnet +subgraph_lib: + g++ -shared -fPIC -std=gnu++0x subgraph_lib.cc -o subgraph_lib.so -I ../../include/mxnet + test: g++ -std=c++11 -O3 -o libtest libtest.cc -ldl -I ../../include/mxnet diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index ac2bd974a94d..685ca4ddb2d4 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -138,7 +138,13 @@ MXReturnValue inferShape(std::map attrs, std::vector +#include "lib_api.h" + +MXReturnValue parseAttrs(std::map attrs, + int* num_in, int* num_out) { + *num_in = 2; + *num_out = 1; + + return MX_SUCCESS; +} + +MXReturnValue inferType(std::map attrs, std::vector &intypes, + std::vector &outtypes) { + outtypes[0] = intypes[0]; + return MX_SUCCESS; +} + +MXReturnValue inferShape(std::map attrs, std::vector> &inshapes, + std::vector> &outshapes) { + outshapes[0] = inshapes[0]; + return MX_SUCCESS; +} + +MXReturnValue myFCompute(std::map attrs, + std::vector inputs, std::vector outputs, + OpResource res) { + outputs[0] = inputs[0]; + return MX_SUCCESS; +} + +REGISTER_OP(subgraph_op) +.setFCompute(myFCompute) +.setParseAttrs(parseAttrs) +.setInferType(inferType) +.setInferShape(inferShape); + +MXReturnValue initialize(int version) { + if (version >= 10400) { + std::cout << "MXNet version " << version << " supported" << std::endl; + return MX_SUCCESS; + } else { + std::cout << "MXNet version " << version << " not supported" << std::endl; + return MX_FAIL; + } +} + diff --git a/example/lib_ops/test.py b/example/lib_ops/test.py index b9d5aeb49340..d1027faa71bb 100644 --- a/example/lib_ops/test.py +++ b/example/lib_ops/test.py @@ -41,6 +41,15 @@ #print inputs print(a) print(b) +print('--------------') #compute and print output -print(mx.nd.sam(a,b)) +print(mx.nd.gemm(a,b)) + +# symbol api +s = mx.sym.Variable('s') +t = mx.sym.Variable('t') +c = mx.sym.warpctc(s,t) +exe = c.bind(ctx=mx.cpu(),args={'s':a,'t':b}) +out = exe.forward() +print(out) diff --git a/example/lib_ops/test_subgraph.py b/example/lib_ops/test_subgraph.py new file mode 100644 index 000000000000..2791ebee2e9f --- 
/dev/null
+++ b/example/lib_ops/test_subgraph.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=arguments-differ
+
+# This test checks if dynamic loading of library into MXNet is successful
+# and checks the end to end computation of custom operator
+
+import mxnet as mx
+import os
+
+# load library
+if (os.name=='posix'):
+    path = os.path.abspath('subgraph_lib.so')
+    mx.library.load(path)
+elif (os.name=='nt'):
+    path = os.path.abspath('subgraph_lib.so')
+    mx.library.load(path)
+
+# setup inputs to call test operator
+a = mx.nd.array([[1,2],[3,4]])
+b = mx.nd.array([[5,6],[7,8]])
+
+# imperative compute and print output
+print(mx.nd.subgraph_op(a,b))
+
+# symbolic compute
+s = mx.sym.Variable('s')
+t = mx.sym.Variable('t')
+c = mx.sym.subgraph_op(s,t)
+exe = c.bind(ctx=mx.cpu(),args={'s':a,'t':b})
+out = exe.forward()
+print(out)

From 33d9cd762470882a92618acc20c74024198ea80d Mon Sep 17 00:00:00 2001
From: rondogency
Date: Fri, 30 Aug 2019 21:48:28 +0000
Subject: [PATCH 025/111] implement FMutateInput as optional attribute

---
 example/lib_ops/subgraph_lib.cc  | 49 ++++++++++++++++++++--
 example/lib_ops/test_subgraph.py |  2 +-
 include/mxnet/lib_api.h          | 71 +++++++++++++++++++++++++++-----
 src/c_api/c_api.cc               | 37 ++++++++++++++++-
 4 files changed, 142 insertions(+), 17 deletions(-)

diff --git a/example/lib_ops/subgraph_lib.cc b/example/lib_ops/subgraph_lib.cc
index e836659bf8ce..ddea2aefb654 100644
--- a/example/lib_ops/subgraph_lib.cc
+++ b/example/lib_ops/subgraph_lib.cc
@@ -27,11 +27,22 @@
 #include <iostream>
 #include "lib_api.h"

+void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) {
+  unsigned i,j,kk;
+  for (i=0;i<n;i++) {
+    for (j=0;j<m;j++) {
+      C[i*m+j] = 0;
+      for (kk=0;kk<k;kk++) {
+        C[i*m+j] += A[i*k+kk] * B[kk*m+j];
+      }
+    }
+  }
+}
+
 MXReturnValue parseAttrs(std::map<std::string, std::string> attrs,
                          int* num_in, int* num_out) {
   *num_in = 2;
   *num_out = 1;
-
   return MX_SUCCESS;
 }
@@ -43,14 +54,43 @@ MXReturnValue inferType(std::map<std::string, std::string> attrs,
                         std::vector<int> &intypes,
                         std::vector<int> &outtypes) {
   outtypes[0] = intypes[0];
   return MX_SUCCESS;
 }

 MXReturnValue inferShape(std::map<std::string, std::string> attrs,
                          std::vector<std::vector<unsigned int>> &inshapes,
                          std::vector<std::vector<unsigned int>> &outshapes) {
-  outshapes[0] = inshapes[0];
+  unsigned n = inshapes[0][0];
+  unsigned k = inshapes[0][1];
+  unsigned kk = inshapes[1][0];
+  unsigned m = inshapes[1][1];
+
+  std::cout << "inshapes[0][0]=" << n << " inshapes[0][1]=" << k << std::endl;
+  std::cout << "inshapes[1][0]=" << kk << " inshapes[1][1]=" << m << std::endl;
+
+  if (k != kk)
+    return MX_FAIL;
+
+  outshapes[0].push_back(n);
+  outshapes[0].push_back(m);
   return MX_SUCCESS;
 }

 MXReturnValue myFCompute(std::map<std::string, std::string> attrs,
                          std::vector<MXTensor> inputs, std::vector<MXTensor> outputs,
                          OpResource res) {
   //extract data pointers from tensors
   float* input1 = inputs[0].getData<float>();
   float* input2 = inputs[1].getData<float>();
   float* output = outputs[0].getData<float>();
   //set tensor shapes
   unsigned n = inputs[0].shape[0];
   unsigned k = inputs[0].shape[1];
   unsigned m = inputs[1].shape[1];
gemm(input1, input2, output, n, k, m); + + return MX_SUCCESS; +} + +MXReturnValue mutateInputs(std::map attrs, + std::vector &input_indices) { + input_indices.push_back(1); + std::cout << "the 1st input is marked as mutate input by library author" << std::endl; return MX_SUCCESS; } @@ -58,7 +98,8 @@ REGISTER_OP(subgraph_op) .setFCompute(myFCompute) .setParseAttrs(parseAttrs) .setInferType(inferType) -.setInferShape(inferShape); +.setInferShape(inferShape) +.setMutateInputs(mutateInputs); MXReturnValue initialize(int version) { if (version >= 10400) { diff --git a/example/lib_ops/test_subgraph.py b/example/lib_ops/test_subgraph.py index 2791ebee2e9f..279c6af6ca12 100644 --- a/example/lib_ops/test_subgraph.py +++ b/example/lib_ops/test_subgraph.py @@ -45,6 +45,6 @@ s = mx.sym.Variable('s') t = mx.sym.Variable('t') c = mx.sym.subgraph_op(s,t) -exe = c.bind(ctx=mx.cpu(),args={'s':a,'t':b}) +exe = c.bind(ctx=mx.cpu(),args={'s':a},aux_states={'t':b}) out = exe.forward() print(out) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index f0256cc616c8..bf6b7ae8d156 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -48,11 +48,17 @@ enum MXDType { kInt64 = 6, }; +enum MXReturnValue { + MX_FAIL = 0, + MX_SUCCESS = 1, +}; + /*! * \brief External Tensor data structure */ struct MXTensor { - MXTensor() { data = nullptr; } + MXTensor() : data(nullptr) {} + MXTensor(void *data, const std::vector &shape, MXDType dtype) : data{data}, shape{shape}, dtype{dtype} {} @@ -92,11 +98,6 @@ class OpResource { void* _xpu_malloc; }; -enum MXReturnValue { - MX_FAIL = 0, - MX_SUCCESS = 1, -}; - /*! * Custom Operator function templates */ @@ -110,6 +111,8 @@ typedef MXReturnValue (*inferType_t)(std::map, typedef MXReturnValue (*inferShape_t)(std::map, std::vector>&, std::vector>&); +typedef MXReturnValue (*mutateInputs_t)(std::map, + std::vector&); /*! * \brief Class to hold custom operator registration @@ -117,7 +120,8 @@ typedef MXReturnValue (*inferShape_t)(std::map, class CustomOp { public: explicit CustomOp(const char* op_name) : name(op_name), fcompute(nullptr), - parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr) {} + parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr), + mutate_inputs(nullptr) {} ~CustomOp() {} CustomOp& setFCompute(fcomp_t fcomp) { fcompute = fcomp; @@ -135,6 +139,10 @@ class CustomOp { infer_shape = func; return *this; } + CustomOp& setMutateInputs(mutateInputs_t func) { + mutate_inputs = func; + return *this; + } /*! \brief operator name */ const char* name; /*! \brief operator functions */ @@ -142,6 +150,7 @@ class CustomOp { parseAttrs_t parse_attrs; inferType_t infer_type; inferShape_t infer_shape; + mutateInputs_t mutate_inputs; }; /*! @@ -210,21 +219,23 @@ class Registry { #define REGISTER_OP(Name) _STR_CONCAT(_REGISTER_DEF_(Name), __COUNTER__) = \ Registry::get()->add(TOSTRING(Name)) +/* + * -------------- BELOW FUNCTIONS ARE USED IN MXNET BACKEND --------------- + */ /*! 
- * \brief Following are the APIs implemented in the external library + * \brief Following are the C type APIs implemented in the external library * Each API has a #define string that is used to lookup the function in the library * Followed by the function declaration */ - #define MXLIB_OPREGSIZE_STR "_opRegSize" typedef int (*opRegSize_t)(void); #define MXLIB_OPREGGET_STR "_opRegGet" typedef int (*opRegGet_t)(int, const char**, fcomp_t*, parseAttrs_t*, inferType_t*, - inferShape_t*); + inferShape_t*, mutateInputs_t*); #define MXLIB_OPCALLFREE_STR "_opCallFree" typedef int (*opCallFree_t)(void*); @@ -248,6 +259,10 @@ typedef int (*opCallFComp_t)(fcomp_t, const char* const*, const char* const*, in const int64_t**, int*, void**, int*, int, xpu_malloc_t, void*); +#define MXLIB_OPCALLMUTATEINPUTS_STR "_opCallMutateInputs" +typedef int (*opCallMutateInputs_t)(mutateInputs_t, const char* const*, const char* const*, int, + int**, int*); + #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int); @@ -289,13 +304,14 @@ extern "C" { #endif _opRegGet(int idx, const char** name, fcomp_t* fcomp, parseAttrs_t* parse, inferType_t* type, - inferShape_t* shape) { + inferShape_t* shape, mutateInputs_t* mutate) { CustomOp op = Registry::get()->get(idx); *name = op.name; *fcomp = op.fcompute; *parse = op.parse_attrs; *type = op.infer_type; *shape = op.infer_shape; + *mutate = op.mutate_inputs; } /*! @@ -464,6 +480,39 @@ extern "C" { return fcomp(attrs, inputs, outputs, res); } + /*! + * \brief returns status of calling mutate inputs function for operator from library + */ +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl +#else + int +#endif + _opCallMutateInputs(mutateInputs_t mutate, const char* const* keys, + const char* const* vals, int num, + int** mutate_indices, int* indices_size) { + // create map of attributes from list + std::map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } + + // create a vector of mutate input indices + std::vector mut_ind; + + int retval = mutate(attrs, mut_ind); + if (!retval) + return retval; + + // output the input indices + *indices_size = mut_ind.size(); + *mutate_indices = static_cast(malloc (*indices_size * sizeof(int))); + for (int i = 0; i < *indices_size; i++) { + (*mutate_indices)[i] = mut_ind[i]; + } + + return retval; + } /*! * \brief Checks if the MXNet version is supported by the library. * If supported, initializes the library. 
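[Editorial example] Before the MXNet-side wiring in the next hunk: a library
opts into input mutation by registering a mutateInputs hook that reports
which inputs the op writes in place (subgraph_lib.cc in this patch marks
index 1). Condensed from the example in this series:

    MXReturnValue mutateInputs(std::map<std::string, std::string> attrs,
                               std::vector<int> &input_indices) {
      input_indices.push_back(1);  // input 1 is updated in place
      return MX_SUCCESS;
    }

    REGISTER_OP(subgraph_op)
    .setFCompute(myFCompute)
    .setParseAttrs(parseAttrs)
    .setInferType(inferType)
    .setInferShape(inferShape)
    .setMutateInputs(mutateInputs);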
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 2bacee2663ee..13de9664ed08 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -130,6 +130,9 @@ int MXLoadLib(const char *path) { opCallFComp_t callFComp = get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); + opCallMutateInputs_t callMutateInputs = + get_func(lib, const_cast(MXLIB_OPCALLMUTATEINPUTS_STR)); + // get number of operators registered in the library opRegSize_t opRegSize = get_func(lib, const_cast(MXLIB_OPREGSIZE_STR)); int numOps = opRegSize(); @@ -142,13 +145,15 @@ int MXLoadLib(const char *path) { opRegGet_t opRegGet = get_func(lib, const_cast(MXLIB_OPREGGET_STR)); for (int i = 0; i < numOps; i++) { const char* name; + // function pointers holding implementation from custom library fcomp_t fcomp = nullptr; parseAttrs_t parse = nullptr; inferType_t type = nullptr; inferShape_t shape = nullptr; + mutateInputs_t mutate = nullptr; // optional // get custom operator implemenation from the dynamic library - opRegGet(i, &name, &fcomp, &parse, &type, &shape); + opRegGet(i, &name, &fcomp, &parse, &type, &shape, &mutate); // validate custom operator functions from the dynamic library CHECK(fcomp != nullptr) << "Error loading '" << name @@ -392,6 +397,32 @@ int MXLoadLib(const char *path) { // return type void }; + // lambda function to convert from external mutate_inputs to internal MXNet types + auto mutate_inputs = [=](const nnvm::NodeAttrs& attrs) { + // convert attributes to vector of char* + std::vector attr_keys, attr_vals; + for (auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + + // C type placeholder for mutate input indices vector + int* mutate_indices = nullptr; + int indices_size = 0; + + // call mutate inputs function + CHECK(callMutateInputs(mutate, attr_keys.data(), attr_vals.data(), attr_keys.size(), + &mutate_indices, &indices_size)) + << "Error calling MutateInputs for custom operator '" << name_str << "'"; + + std::vector mutate_indices_list(indices_size); + for (int i=0; i(mutate_indices[i]); + } + + return mutate_indices_list; + }; + // check if operator is already registered const nnvm::Op *regOpPtr = dmlc::Registry::Get()->Find(name); if (regOpPtr == nullptr) { @@ -409,6 +440,8 @@ int MXLoadLib(const char *path) { regOp.set_attr("FInferType", infer_type); regOp.set_attr("FInferShape", infer_shape); regOp.set_attr("FCompute", fcomp_conv); + if (mutate != nullptr) + regOp.set_attr("FMutateInputs", mutate_inputs); } else { // overwrite registration of existing op with custom op nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); @@ -425,6 +458,8 @@ int MXLoadLib(const char *path) { regOp.set_attr("FInferType", infer_type, 11); regOp.set_attr("FInferShape", infer_shape, 11); regOp.set_attr("FCompute", fcomp_conv, 11); + if (mutate != nullptr) + regOp.set_attr("FMutateInputs", mutate_inputs, 11); } } From 4576570e6f82b4d0e9c606674e96df126b8e75f1 Mon Sep 17 00:00:00 2001 From: rondogency Date: Fri, 30 Aug 2019 22:10:02 +0000 Subject: [PATCH 026/111] fix sanity check --- src/c_api/c_api.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 13de9664ed08..a8da150b001c 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -150,7 +150,8 @@ int MXLoadLib(const char *path) { parseAttrs_t parse = nullptr; inferType_t type = nullptr; inferShape_t shape = nullptr; - mutateInputs_t mutate = nullptr; // optional + // optional attributes + mutateInputs_t mutate = nullptr; 
// get custom operator implemenation from the dynamic library opRegGet(i, &name, &fcomp, &parse, &type, &shape, &mutate); @@ -416,7 +417,7 @@ int MXLoadLib(const char *path) { << "Error calling MutateInputs for custom operator '" << name_str << "'"; std::vector mutate_indices_list(indices_size); - for (int i=0; i(mutate_indices[i]); } From 6f3e3d983774974d7ce057574606c91645b214f4 Mon Sep 17 00:00:00 2001 From: rondogency Date: Tue, 3 Sep 2019 04:07:25 +0000 Subject: [PATCH 027/111] replace fcompute to fcomputeEx and implement simple finferstoragetype --- src/c_api/c_api.cc | 79 +++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 36 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index a8da150b001c..278b8fc61e79 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -115,7 +115,7 @@ int MXLoadLib(const char *path) { if (!initialize(static_cast(MXNET_VERSION))) LOG(FATAL) << "Library failed to initialize"; - // get call functions + // get C type interface functions opCallFree_t callFree = get_func(lib, const_cast(MXLIB_OPCALLFREE_STR)); opCallParseAttrs_t callParseAttrs = @@ -146,24 +146,24 @@ int MXLoadLib(const char *path) { for (int i = 0; i < numOps; i++) { const char* name; // function pointers holding implementation from custom library - fcomp_t fcomp = nullptr; - parseAttrs_t parse = nullptr; - inferType_t type = nullptr; - inferShape_t shape = nullptr; + fcomp_t fcomp_fp = nullptr; + parseAttrs_t parse_fp = nullptr; + inferType_t type_fp = nullptr; + inferShape_t shape_fp = nullptr; // optional attributes - mutateInputs_t mutate = nullptr; + mutateInputs_t mutate_fp = nullptr; // get custom operator implemenation from the dynamic library - opRegGet(i, &name, &fcomp, &parse, &type, &shape, &mutate); + opRegGet(i, &name, &fcomp_fp, &parse_fp, &type_fp, &shape_fp, &mutate_fp); // validate custom operator functions from the dynamic library - CHECK(fcomp != nullptr) << "Error loading '" << name + CHECK(fcomp_fp != nullptr) << "Error loading '" << name << "' custom op, FCompute function was not set."; - CHECK(parse != nullptr) << "Error loading '" << name + CHECK(parse_fp != nullptr) << "Error loading '" << name << "' custom op, ParseAttrs function was not set."; - CHECK(type != nullptr) << "Error loading '" << name + CHECK(type_fp != nullptr) << "Error loading '" << name << "' custom op, InferType function was not set."; - CHECK(shape != nullptr) << "Error loading '" << name + CHECK(shape_fp != nullptr) << "Error loading '" << name << "' custom op, InferShape function was not set."; LOG(INFO) << "\tOp[" << i << "] " << name; @@ -186,7 +186,7 @@ int MXLoadLib(const char *path) { int num_in = -1; int num_out = -1; - CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callParseAttrs(parse_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs for custom operator '" << name_str << "'"; @@ -204,7 +204,7 @@ int MXLoadLib(const char *path) { int num_in = -1; int num_out = -1; - CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callParseAttrs(parse_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs::num_inputs for custom operator '" << name_str << "'"; @@ -222,7 +222,7 @@ int MXLoadLib(const char *path) { int num_in = -1; int num_out = -1; - CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callParseAttrs(parse_fp, 
attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs::num_outputs for custom operator '" << name_str << "'"; @@ -262,7 +262,7 @@ int MXLoadLib(const char *path) { uint32_t** outshapes = nullptr; int* outdims = nullptr; - CHECK(callInferShape(shape, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callInferShape(shape_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), inshapes.data(), indims.data(), in_shape->size(), &outshapes, &outdims, out_shape->size())) << "Error calling InferShape for custom operator '" << name_str << "'"; @@ -317,7 +317,7 @@ int MXLoadLib(const char *path) { // output types will be populated by inferType function std::vector outtypes(out_type->size()); - CHECK(callInferType(type, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callInferType(type_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), intypes.data(), in_type->size(), outtypes.data(), out_type->size())) << "Error calling InferType for custom operator '" << name_str << "'"; @@ -331,11 +331,11 @@ int MXLoadLib(const char *path) { }; // lambda function to convert from external fcompute to internal MXNet types - auto fcomp_conv = [=](const nnvm::NodeAttrs& attrs, + auto fcomp_lambda = [=](const nnvm::NodeAttrs& attrs, const OpContext& ctx, - const std::vector& inputs, + const std::vector& inputs, const std::vector& req, - const std::vector& outputs) { + const std::vector& outputs) { // convert attributes to vector of char* std::vector attr_keys, attr_vals; for (auto kv : attrs.dict) { @@ -350,18 +350,18 @@ int MXLoadLib(const char *path) { // convert input tensors to constituent parts for (size_t i = 0; i < inputs.size(); i++) { - in_data.push_back(inputs[i].dptr_); - in_shapes.push_back(inputs[i].shape_.data()); - in_dims.push_back(inputs[i].shape_.ndim()); - in_types.push_back(inputs[i].type_flag_); + in_data.push_back(inputs[i].data().dptr_); + in_shapes.push_back(inputs[i].shape().data()); + in_dims.push_back(inputs[i].shape().ndim()); + in_types.push_back(inputs[i].dtype()); } // convert output tensors to constituent parts for (size_t i = 0; i < outputs.size(); i++) { - out_data.push_back(outputs[i].dptr_); - out_shapes.push_back(outputs[i].shape_.data()); - out_dims.push_back(outputs[i].shape_.ndim()); - out_types.push_back(outputs[i].type_flag_); + out_data.push_back(outputs[i].data().dptr_); + out_shapes.push_back(outputs[i].shape().data()); + out_dims.push_back(outputs[i].shape().ndim()); + out_types.push_back(outputs[i].dtype()); } // get memory resource @@ -388,7 +388,7 @@ int MXLoadLib(const char *path) { }; // call fcompute function - CHECK(callFComp(fcomp, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callFComp(fcomp_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), in_shapes.data(), in_dims.data(), in_data.data(), in_types.data(), in_data.size(), out_shapes.data(), out_dims.data(), out_data.data(), @@ -412,7 +412,7 @@ int MXLoadLib(const char *path) { int indices_size = 0; // call mutate inputs function - CHECK(callMutateInputs(mutate, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callMutateInputs(mutate_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), &mutate_indices, &indices_size)) << "Error calling MutateInputs for custom operator '" << name_str << "'"; @@ -424,6 +424,15 @@ int MXLoadLib(const char *path) { return mutate_indices_list; }; + auto infer_storage_type = [=](const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + 
std::vector* in_stypes, + std::vector* out_stypes) { + return op::storage_type_assign(out_stypes, mxnet::kDefaultStorage, + dispatch_mode, DispatchMode::kFComputeEx); + }; + // check if operator is already registered const nnvm::Op *regOpPtr = dmlc::Registry::Get()->Find(name); if (regOpPtr == nullptr) { @@ -432,6 +441,7 @@ int MXLoadLib(const char *path) { regOp.set_attr_parser(attr_parser); regOp.set_num_inputs(num_inputs); regOp.set_num_outputs(num_outputs); + regOp.set_attr("FInferStorageType", infer_storage_type); regOp.set_attr("FResourceRequest", [](const NodeAttrs& attrs) { return std::vector{ @@ -440,26 +450,23 @@ int MXLoadLib(const char *path) { regOp.add_argument("data", "NDArray[]", "Source inputs"); regOp.set_attr("FInferType", infer_type); regOp.set_attr("FInferShape", infer_shape); - regOp.set_attr("FCompute", fcomp_conv); - if (mutate != nullptr) + regOp.set_attr("FComputeEx", fcomp_lambda); + if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs); } else { // overwrite registration of existing op with custom op nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); - regOp.set_attr_parser(attr_parser); regOp.set_num_inputs(num_inputs); regOp.set_num_outputs(num_outputs); - regOp.arguments.clear(); regOp.add_argument("data", "NDArray[]", "Source inputs"); - // set attribute with higher plevel (11) to allow re-registering once // TODO(samskalicky): enable constant overwriting of registertion multiple times regOp.set_attr("FInferType", infer_type, 11); regOp.set_attr("FInferShape", infer_shape, 11); - regOp.set_attr("FCompute", fcomp_conv, 11); - if (mutate != nullptr) + regOp.set_attr("FComputeEx", fcomp_lambda, 11); + if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs, 11); } } From ff9a868eba45879a60fe6eca3c33b4bfa36baa42 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Wed, 4 Sep 2019 07:13:18 +0000 Subject: [PATCH 028/111] changed fcompute to forward --- example/lib_ops/mylib.cc | 6 +++--- example/lib_ops/subgraph_lib.cc | 4 ++-- include/mxnet/lib_api.h | 11 ++++++++--- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index 685ca4ddb2d4..5d09760d7b74 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -43,7 +43,7 @@ void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { } -MXReturnValue myFCompute(std::map attrs, +MXReturnValue forward(std::map attrs, std::vector inputs, std::vector outputs, OpResource res) { //validate inputs @@ -139,13 +139,13 @@ MXReturnValue inferShape(std::map attrs, std::vector attrs, std::vector attrs, +MXReturnValue forward(std::map attrs, std::vector inputs, std::vector outputs, OpResource res) { //extract data pointers from tensors @@ -95,7 +95,7 @@ MXReturnValue mutateInputs(std::map attrs, } REGISTER_OP(subgraph_op) -.setFCompute(myFCompute) +.setForward(forward) .setParseAttrs(parseAttrs) .setInferType(inferType) .setInferShape(inferShape) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index bf6b7ae8d156..2ba51ad28f47 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -120,13 +120,17 @@ typedef MXReturnValue (*mutateInputs_t)(std::map, class CustomOp { public: explicit CustomOp(const char* op_name) : name(op_name), fcompute(nullptr), - parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr), - mutate_inputs(nullptr) {} + fgradient(nullptr), parse_attrs(nullptr), infer_type(nullptr), + infer_shape(nullptr), mutate_inputs(nullptr) {} 
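/* Editor's note: with setGradient joining setForward above, a library author
 * registers both passes through one chain. A minimal sketch (the backward body
 * and the MX_SUCCESS constant are illustrative assumptions, and the
 * [output grads, inputs, outputs] layout is only wired up by a later patch in
 * this series):
 *
 *   MXReturnValue backward(std::map<std::string, std::string> attrs,
 *                          std::vector<MXTensor> inputs,
 *                          std::vector<MXTensor> outputs,
 *                          OpResource res) {
 *     // inputs arrive as [output grads, forward inputs, forward outputs]
 *     return MX_SUCCESS;
 *   }
 *
 *   REGISTER_OP(my_gemm)
 *   .setForward(forward)
 *   .setGradient(backward)
 *   .setParseAttrs(parseAttrs)
 *   .setInferType(inferType)
 *   .setInferShape(inferShape);
 */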
~CustomOp() {} - CustomOp& setFCompute(fcomp_t fcomp) { + CustomOp& setForward(fcomp_t fcomp) { fcompute = fcomp; return *this; } + CustomOp& setGradient(fcomp_t fcomp) { + fgradient = fcomp; + return *this; + } CustomOp& setParseAttrs(parseAttrs_t func) { parse_attrs = func; return *this; @@ -147,6 +151,7 @@ class CustomOp { const char* name; /*! \brief operator functions */ fcomp_t fcompute; + fcomp_t fgradient; parseAttrs_t parse_attrs; inferType_t infer_type; inferShape_t infer_shape; From 0be218b78b2bf4bf26dd071c6272b0564c7c3fbc Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Wed, 4 Sep 2019 07:43:06 +0000 Subject: [PATCH 029/111] initial commit with fgradient support --- include/mxnet/lib_api.h | 5 +++-- src/c_api/c_api.cc | 45 +++++++++++++++++++++++++---------------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 2ba51ad28f47..6c1166a7f147 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -238,7 +238,7 @@ class Registry { typedef int (*opRegSize_t)(void); #define MXLIB_OPREGGET_STR "_opRegGet" -typedef int (*opRegGet_t)(int, const char**, fcomp_t*, +typedef int (*opRegGet_t)(int, const char**, fcomp_t*, fcomp_t*, parseAttrs_t*, inferType_t*, inferShape_t*, mutateInputs_t*); @@ -307,12 +307,13 @@ extern "C" { #else void #endif - _opRegGet(int idx, const char** name, fcomp_t* fcomp, + _opRegGet(int idx, const char** name, fcomp_t* fcomp, fcomp_t* fgrad, parseAttrs_t* parse, inferType_t* type, inferShape_t* shape, mutateInputs_t* mutate) { CustomOp op = Registry::get()->get(idx); *name = op.name; *fcomp = op.fcompute; + *fgrad = op.fgradient; *parse = op.parse_attrs; *type = op.infer_type; *shape = op.infer_shape; diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 7451a4201952..dd5cf7686dd0 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -147,6 +147,7 @@ int MXLoadLib(const char *path) { const char* name; // function pointers holding implementation from custom library fcomp_t fcomp_fp = nullptr; + fcomp_t fgrad_fp = nullptr; parseAttrs_t parse_fp = nullptr; inferType_t type_fp = nullptr; inferShape_t shape_fp = nullptr; @@ -154,7 +155,7 @@ int MXLoadLib(const char *path) { mutateInputs_t mutate_fp = nullptr; // get custom operator implemenation from the dynamic library - opRegGet(i, &name, &fcomp_fp, &parse_fp, &type_fp, &shape_fp, &mutate_fp); + opRegGet(i, &name, &fcomp_fp, &fgrad_fp, &parse_fp, &type_fp, &shape_fp, &mutate_fp); // validate custom operator functions from the dynamic library CHECK(fcomp_fp != nullptr) << "Error loading '" << name @@ -435,32 +436,40 @@ int MXLoadLib(const char *path) { // check if operator is already registered const nnvm::Op *regOpPtr = dmlc::Registry::Get()->Find(name); + nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); + regOp.set_attr_parser(attr_parser); + regOp.set_num_inputs(num_inputs); + regOp.set_num_outputs(num_outputs); + regOp.set_attr("FInferStorageType", infer_storage_type); + regOp.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ + ResourceRequest::kTempSpace}; + }); if (regOpPtr == nullptr) { // re-register op in MXNet using lambda converter functions - nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); - regOp.set_attr_parser(attr_parser); - regOp.set_num_inputs(num_inputs); - regOp.set_num_outputs(num_outputs); - regOp.set_attr("FInferStorageType", infer_storage_type); - regOp.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ 
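/* Editor's note: requesting kTempSpace here (list continues below) is what
 * gives the compute lambda a scratch-memory resource to forward to the
 * library. A hedged sketch of how such a request is typically consumed inside
 * an FCompute-style function ('bytes' is a placeholder):
 *
 *   const mxnet::Resource& r = ctx.requested[0];
 *   auto scratch = r.get_space_typed<mshadow::cpu, 1, char>(
 *       mshadow::Shape1(bytes), ctx.get_stream<mshadow::cpu>());
 *   // scratch.dptr_ is only valid for the duration of this call
 */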
- ResourceRequest::kTempSpace}; - }); - regOp.add_argument("data", "NDArray[]", "Source inputs"); regOp.set_attr("FInferType", infer_type); regOp.set_attr("FInferShape", infer_shape); regOp.set_attr("FComputeEx", fcomp_lambda); if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs); + if (fgrad_fp != nullptr) { + // regOp.set_attr("FGradient"); + std::string grad_name(std::string("_backward_") + name); + nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); + gradOp.set_attr("TIsBackward", true); + // gradOp.set_attr_parser(); + // gradOp.set_num_inputs(); + // gradOp.set_num_outputs(); + // gradOp.set_attr("FInferStorageType"); + // gradOp.set_attr("FResourceRequest", [](const NodeAttrs& n) { + // return std::vector{ResourceRequest::kTempSpace}; + // }) + // gradOp.set_attr("FComputeEx"); + } } else { // overwrite registration of existing op with custom op - nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); - regOp.set_attr_parser(attr_parser); - regOp.set_num_inputs(num_inputs); - regOp.set_num_outputs(num_outputs); regOp.arguments.clear(); - regOp.add_argument("data", "NDArray[]", "Source inputs"); // set attribute with higher plevel (11) to allow re-registering once // TODO(samskalicky): enable constant overwriting of registertion multiple times regOp.set_attr("FInferType", infer_type, 11); @@ -468,7 +477,9 @@ int MXLoadLib(const char *path) { regOp.set_attr("FComputeEx", fcomp_lambda, 11); if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs, 11); + // TODO(samskalicky): add fgrad support here too } + regOp.add_argument("data", "NDArray[]", "Source inputs"); } API_END(); From 570a059538c2f77adb24e7812a8ea3d240e7d0dd Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Wed, 4 Sep 2019 15:58:02 +0000 Subject: [PATCH 030/111] enabled gradient registration --- src/c_api/c_api.cc | 130 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 104 insertions(+), 26 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index dd5cf7686dd0..20428835964d 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -50,6 +50,7 @@ #include "../initialize.h" #include "./c_api_common.h" #include "../operator/custom/custom-inl.h" +#include "../operator/operator_common.h" #include "../operator/tensor/matrix_op-inl.h" #include "../operator/tvmop/op_module.h" #include "../common/utils.h" @@ -230,6 +231,25 @@ int MXLoadLib(const char *path) { return num_out; }; + // lambda function to call parse attributes and return the number of inputs and outputs + // for gradient computation + auto num_inouts = [=](const NodeAttrs& attrs) { + // convert attributes to vector of char* + std::vector attr_keys, attr_vals; + for (auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + + int num_in = -1; + int num_out = -1; + CHECK(callParseAttrs(parse_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), + &num_in, &num_out)) + << "Error calling ParseAttrs::num_outputs for custom operator '" << name_str << "'"; + + return num_in + num_out; + }; + // lambda function to call infer shape auto infer_shape = [=] (const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_shape, @@ -332,11 +352,12 @@ int MXLoadLib(const char *path) { }; // lambda function to convert from external fcompute to internal MXNet types - auto fcomp_lambda = [=](const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + auto 
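/* Editor's note: the declaration split across this point takes the library
 * function pointer as an explicit first argument, so one conversion routine
 * can serve both passes; the forward_lambda/gradient_lambda wrappers defined
 * just after it simply bind fcomp_fp or fgrad_fp. The same currying idea in
 * miniature (illustrative only):
 *
 *   auto call_through = [](int (*fp)(int), int x) { return fp(x); };
 *   auto fwd = [=](int x) { return call_through(+[](int v) { return v + 1; }, x); };
 *   auto bwd = [=](int x) { return call_through(+[](int v) { return v - 1; }, x); };
 */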
fcomp_lambda = [=](fcomp_t fcomp_fp, + const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { // convert attributes to vector of char* std::vector attr_keys, attr_vals; for (auto kv : attrs.dict) { @@ -399,6 +420,22 @@ int MXLoadLib(const char *path) { // return type void }; + auto forward_lambda = [=](const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + return fcomp_lambda(fcomp_fp, attrs, ctx, inputs, req, outputs); + }; + + auto gradient_lambda = [=](const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + return fcomp_lambda(fgrad_fp, attrs, ctx, inputs, req, outputs); + }; + // lambda function to convert from external mutate_inputs to internal MXNet types auto mutate_inputs = [=](const nnvm::NodeAttrs& attrs) { // convert attributes to vector of char* @@ -425,15 +462,45 @@ int MXLoadLib(const char *path) { return mutate_indices_list; }; + // lambda function to set storage types auto infer_storage_type = [=](const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_stypes, - std::vector* out_stypes) { + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_stypes, + std::vector* out_stypes) { + // set outputs as dense return op::storage_type_assign(out_stypes, mxnet::kDefaultStorage, dispatch_mode, DispatchMode::kFComputeEx); }; + /* + * GradStruct + * this struct sets that the operator will use both the inputs and the outputs to compute + * the gradient. The order is: [grads, inputs, outputs] + */ + struct GradStruct { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + // copy gradients first + std::vector heads(ograds.begin(), ograds.end()); + // copy inputs second + for (auto& h : n->inputs) { + heads.push_back(h); + } + // copy outputs last + uint32_t n_out = n->num_outputs(); + for (uint32_t i = 0; i < n_out; ++i) { + heads.emplace_back(n, i, 0); + } + return mxnet::op::MakeGradNode(op_name, n, heads, n->attrs.dict); + } + }; + + auto resc_req = [=](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }; + // check if operator is already registered const nnvm::Op *regOpPtr = dmlc::Registry::Get()->Find(name); nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); @@ -441,31 +508,28 @@ int MXLoadLib(const char *path) { regOp.set_num_inputs(num_inputs); regOp.set_num_outputs(num_outputs); regOp.set_attr("FInferStorageType", infer_storage_type); - regOp.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ - ResourceRequest::kTempSpace}; - }); + regOp.set_attr("FResourceRequest", resc_req); if (regOpPtr == nullptr) { // re-register op in MXNet using lambda converter functions regOp.set_attr("FInferType", infer_type); regOp.set_attr("FInferShape", infer_shape); - regOp.set_attr("FComputeEx", fcomp_lambda); + regOp.set_attr("FComputeEx", forward_lambda); + // optionally add fmutate inputs if user specified a function if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs); + // optionally add fgradient if user specified a function if (fgrad_fp != nullptr) { - // regOp.set_attr("FGradient"); std::string grad_name(std::string("_backward_") + name); + regOp.set_attr("FGradient",GradStruct{grad_name.c_str()}); + nnvm::Op &gradOp = 
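/* Editor's note: GradStruct above assembles the backward node's inputs as
 * [output gradients, forward inputs, forward outputs], in that copy order.
 * For a 2-input, 1-output forward op the registered backward op therefore
 * sees, illustratively:
 *
 *   // inputs[0]            : dL/dY  (one entry per forward output)
 *   // inputs[1], inputs[2] : the forward inputs
 *   // inputs[3]            : the forward output
 *
 * (layout read off operator() above; not yet a documented guarantee at this
 * point in the series)
 */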
dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); gradOp.set_attr("TIsBackward", true); - // gradOp.set_attr_parser(); - // gradOp.set_num_inputs(); - // gradOp.set_num_outputs(); - // gradOp.set_attr("FInferStorageType"); - // gradOp.set_attr("FResourceRequest", [](const NodeAttrs& n) { - // return std::vector{ResourceRequest::kTempSpace}; - // }) - // gradOp.set_attr("FComputeEx"); + gradOp.set_attr_parser(attr_parser); + gradOp.set_num_inputs(num_inouts); + gradOp.set_num_outputs(num_outputs); + gradOp.set_attr("FInferStorageType", infer_storage_type); + gradOp.set_attr("FResourceRequest", resc_req); + gradOp.set_attr("FComputeEx",gradient_lambda); } } else { // overwrite registration of existing op with custom op @@ -474,10 +538,24 @@ int MXLoadLib(const char *path) { // TODO(samskalicky): enable constant overwriting of registertion multiple times regOp.set_attr("FInferType", infer_type, 11); regOp.set_attr("FInferShape", infer_shape, 11); - regOp.set_attr("FComputeEx", fcomp_lambda, 11); + regOp.set_attr("FComputeEx", forward_lambda, 11); + // optionally add fmutate inputs if user specified a function if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs, 11); - // TODO(samskalicky): add fgrad support here too + // optionally add fgradient if user specified a function + if (fgrad_fp != nullptr) { + std::string grad_name(std::string("_backward_") + name); + regOp.set_attr("FGradient",GradStruct{grad_name.c_str()}, 11); + + nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); + gradOp.set_attr("TIsBackward", true, 11); + gradOp.set_attr_parser(attr_parser); + gradOp.set_num_inputs(num_inouts); + gradOp.set_num_outputs(num_outputs); + gradOp.set_attr("FInferStorageType", infer_storage_type, 11); + gradOp.set_attr("FResourceRequest", resc_req, 11); + gradOp.set_attr("FComputeEx",gradient_lambda, 11); + } } regOp.add_argument("data", "NDArray[]", "Source inputs"); } From e4be1750156fd90b247f43d359532a0e0276535b Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Wed, 4 Sep 2019 18:05:43 +0000 Subject: [PATCH 031/111] fixed whitespace --- src/c_api/c_api.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 20428835964d..1ba78e2a5c0d 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -249,7 +249,7 @@ int MXLoadLib(const char *path) { return num_in + num_out; }; - + // lambda function to call infer shape auto infer_shape = [=] (const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_shape, @@ -435,7 +435,7 @@ int MXLoadLib(const char *path) { const std::vector& outputs) { return fcomp_lambda(fgrad_fp, attrs, ctx, inputs, req, outputs); }; - + // lambda function to convert from external mutate_inputs to internal MXNet types auto mutate_inputs = [=](const nnvm::NodeAttrs& attrs) { // convert attributes to vector of char* @@ -520,7 +520,7 @@ int MXLoadLib(const char *path) { // optionally add fgradient if user specified a function if (fgrad_fp != nullptr) { std::string grad_name(std::string("_backward_") + name); - regOp.set_attr("FGradient",GradStruct{grad_name.c_str()}); + regOp.set_attr("FGradient", GradStruct{grad_name.c_str()}); nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); gradOp.set_attr("TIsBackward", true); @@ -529,7 +529,7 @@ int MXLoadLib(const char *path) { gradOp.set_num_outputs(num_outputs); gradOp.set_attr("FInferStorageType", infer_storage_type); gradOp.set_attr("FResourceRequest", resc_req); - 
gradOp.set_attr("FComputeEx",gradient_lambda); + gradOp.set_attr("FComputeEx", gradient_lambda); } } else { // overwrite registration of existing op with custom op @@ -545,7 +545,7 @@ int MXLoadLib(const char *path) { // optionally add fgradient if user specified a function if (fgrad_fp != nullptr) { std::string grad_name(std::string("_backward_") + name); - regOp.set_attr("FGradient",GradStruct{grad_name.c_str()}, 11); + regOp.set_attr("FGradient", GradStruct{grad_name.c_str()}, 11); nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); gradOp.set_attr("TIsBackward", true, 11); @@ -554,7 +554,7 @@ int MXLoadLib(const char *path) { gradOp.set_num_outputs(num_outputs); gradOp.set_attr("FInferStorageType", infer_storage_type, 11); gradOp.set_attr("FResourceRequest", resc_req, 11); - gradOp.set_attr("FComputeEx",gradient_lambda, 11); + gradOp.set_attr("FComputeEx", gradient_lambda, 11); } } regOp.add_argument("data", "NDArray[]", "Source inputs"); From 8cfcc853162b55c071d59ac037f2768796322bc3 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 15 Aug 2019 21:15:43 +0000 Subject: [PATCH 032/111] fixed example to use absolute path --- example/lib_api/test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/example/lib_api/test.py b/example/lib_api/test.py index d73d85c02ced..840924c1317c 100644 --- a/example/lib_api/test.py +++ b/example/lib_api/test.py @@ -26,6 +26,8 @@ import os if (os.name=='posix'): - mx.library.load('mylib.so') + path = os.path.abspath('mylib.so') + mx.library.load(path) elif (os.name=='nt'): - mx.library.load('mylib.dll') + path = os.path.abspath('mylib.so') + mx.library.load(path) From 9884ec6702057363af81e28115024fb560e11d65 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Fri, 16 Aug 2019 07:17:03 +0000 Subject: [PATCH 033/111] added example for custom ops, added support for custom op registration --- Makefile | 2 +- example/lib_ops/Makefile | 31 ++++++ example/lib_ops/libtest.cc | 78 +++++++++++++++ example/lib_ops/mylib.cc | 107 ++++++++++++++++++++ example/lib_ops/test.py | 33 ++++++ include/mxnet/lib_api.h | 199 +++++++++++++++++++++++++++++++++++-- src/c_api/c_api.cc | 24 +++++ 7 files changed, 464 insertions(+), 10 deletions(-) create mode 100644 example/lib_ops/Makefile create mode 100644 example/lib_ops/libtest.cc create mode 100644 example/lib_ops/mylib.cc create mode 100644 example/lib_ops/test.py diff --git a/Makefile b/Makefile index 4d823d0ac284..5c06dc800c4b 100644 --- a/Makefile +++ b/Makefile @@ -662,7 +662,7 @@ pylint: python3 -m pylint --rcfile=$(ROOTDIR)/ci/other/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" python/mxnet tools/caffe_converter/*.py sample_lib: - $(CXX) -shared -fPIC example/lib_api/mylib.cc -o libsample_lib.so -I include/mxnet + $(CXX) -shared -fPIC -std=c++11 example/lib_api/mylib.cc -o libsample_lib.so -I include/mxnet doc: docs diff --git a/example/lib_ops/Makefile b/example/lib_ops/Makefile new file mode 100644 index 000000000000..f649a68eee9a --- /dev/null +++ b/example/lib_ops/Makefile @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +all: + g++ -shared -fPIC -std=gnu++0x mylib.cc -o mylib.so -I ../../include/mxnet + +test: + g++ -std=c++11 -O3 -o libtest libtest.cc -ldl -I ../../include/mxnet + +windows: + cl /LD mylib.cc + +win_test: + cl libtest.cc + +clean: + rm -rf mylib.so libtest diff --git a/example/lib_ops/libtest.cc b/example/lib_ops/libtest.cc new file mode 100644 index 000000000000..8bdf36c05d37 --- /dev/null +++ b/example/lib_ops/libtest.cc @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2015 by Contributors + * \file libtest.cc + * \brief This test checks if the library is implemented correctly + * and does not involve dynamic loading of library into MXNet + * This test is supposed to be run before test.py + */ + +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) +#include +#else +#include +#endif + +#include +#include "lib_api.h" + +#define MXNET_VERSION 10500 + +int main(void) { + // Get a handle to the library. +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + HINSTANCE handle; + handle = LoadLibrary(TEXT("mylib.dll")); +#else + void *handle; + handle = dlopen("mylib.so", RTLD_LAZY); +#endif + + if (!handle) { + std::cerr << "Unable to load library" << std::endl; + return 1; + } + + // get initialize function address from the library + initialize_t init_lib; +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + init_lib = (initialize_t) GetProcAddress(handle, MXLIB_INITIALIZE_STR); +#else + init_lib = (initialize_t) dlsym(handle, MXLIB_INITIALIZE_STR); +#endif + + if (!init_lib) { + std::cerr << "Unable to get function 'intialize' from library" << std::endl; + return 1; + } + + // Call the function. + (init_lib)(MXNET_VERSION); + + // Deallocate memory. +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + FreeLibrary(handle); +#else + dlclose(handle); +#endif + + return 0; +} diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc new file mode 100644 index 000000000000..89a012d310d5 --- /dev/null +++ b/example/lib_ops/mylib.cc @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2015 by Contributors + * \file mylib.cc + * \brief Sample library file + */ + +#include +#include "lib_api.h" + +void gemm(double* A, double* B, double* C, unsigned n, unsigned k, unsigned m) { + unsigned i,j,kk; + for (i=0;i attrs, + std::vector inputs, std::vector outputs) { + + double* input1 = inputs[0].getData(); + double* input2 = inputs[1].getData(); + double* output = outputs[0].getData(); + unsigned n = inputs[0].shape[0]; + unsigned k = inputs[0].shape[1]; + unsigned m = inputs[1].shape[1]; + + gemm(input1, input2, output, n, k, m); + + return 1; +} + +int parseAttrs(std::map attrs, + int* num_in, int* num_out) { + + if(attrs.find("myParam") == attrs.end()) { + std::cout << "Missing param 'myParam'" << std::endl; + return 0; + } + + *num_in = 2; + *num_out = 1; + + return 1; //no error +} + +int inferType(std::map attrs, std::vector &intypes, + std::vector &outtypes) { + outtypes[0] = intypes[0]; + + return 1; //no error +} + +int inferShape(std::map attrs, std::vector> &inshapes, + std::vector> &outshapes) { + unsigned n = inshapes[0][0]; + unsigned k = inshapes[0][1]; + unsigned kk = inshapes[1][0]; + unsigned m = inshapes[1][1]; + + if(k != kk) return 0; + + outshapes[0].push_back(n); + outshapes[0].push_back(m); + + return 1; //no error +} + +REGISTER_OP(sam) +.setFCompute(myFCompute) +.setParseAttrs(parseAttrs) +.setInferType(inferType) +.setInferShape(inferShape); + +int initialize(int version) { + if (version >= 10400) { + std::cout << "MXNet version " << version << " supported" << std::endl; + return 1; + } else { + std::cout << "MXNet version " << version << " not supported" << std::endl; + return 0; + } +} + diff --git a/example/lib_ops/test.py b/example/lib_ops/test.py new file mode 100644 index 000000000000..840924c1317c --- /dev/null +++ b/example/lib_ops/test.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
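# Editor's note: the 'nt' branch below still resolves 'mylib.so', but the
# Windows build of this example produces 'mylib.dll' (see the 'cl /LD mylib.cc'
# target in the Makefile above). A hedged sketch of the intended
# platform-aware load:
#
#   import os
#   import mxnet as mx
#   libname = 'mylib.so' if os.name == 'posix' else 'mylib.dll'
#   mx.library.load(os.path.abspath(libname))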
+ +# coding: utf-8 +# pylint: disable=arguments-differ + +# This test checks if dynamic loading of library into MXNet is successful + +import mxnet as mx +import os + +if (os.name=='posix'): + path = os.path.abspath('mylib.so') + mx.library.load(path) +elif (os.name=='nt'): + path = os.path.abspath('mylib.so') + mx.library.load(path) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index ca3b2952eafa..8668ca483326 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -25,26 +25,207 @@ #ifndef MXNET_LIB_API_H_ #define MXNET_LIB_API_H_ +#include +#include +#include + +/*! + * \brief External Tensor data types + */ +enum MXDType { + kFloat32 = 0, + kFloat64 = 1, + kFloat16 = 2, + kUint8 = 3, + kInt32 = 4, + kInt8 = 5, + kInt64 = 6, +}; + +/*! + * \brief External Tensor data structure + */ +struct MXTensor { + MXTensor() { data = nullptr; } + MXTensor(void *data, const std::vector &shape, MXDType dtype) + : data{data}, shape{shape}, dtype{dtype} {} + + /*! + * \brief helper function to cast data pointer + */ + template + data_type* getData() { + return (data_type*)data; + } + + void *data; // not owned + std::vector shape; + MXDType dtype; +}; + +/*! + * Custom Operator function templates + */ +typedef int (*fcomp_t)(std::map, + std::vector, std::vector); +typedef int (*parseAttrs_t)(std::map, + int*, int*); +typedef int (*inferType_t)(std::map, + std::vector&, std::vector&); +typedef int (*inferShape_t)(std::map, + std::vector>&, + std::vector>&); + +/*! + * \brief Class to hold custom operator registration + */ +class CustomOp { + public: + CustomOp(const char* op_name) : name(op_name), fcompute(nullptr), + parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr) {} + ~CustomOp() {} + CustomOp& setFCompute(fcomp_t fcomp) { + fcompute = fcomp; + return *this; + } + CustomOp& setParseAttrs(parseAttrs_t func) { + parse_attrs = func; + return *this; + } + CustomOp& setInferType(inferType_t func) { + infer_type = func; + return *this; + } + CustomOp& setInferShape(inferShape_t func) { + infer_shape = func; + return *this; + } + /*! \brief operator name */ + const char* name; + /*! \brief operator functions */ + fcomp_t fcompute; + parseAttrs_t parse_attrs; + inferType_t infer_type; + inferShape_t infer_shape; +}; + +/*! + * \brief Registry class to registers things (ops, properties) + * Singleton class + */ +template +class Registry { + public: + /*! + * \brief get singleton pointer to class + * \returns pointer to class + */ + static Registry* get() { + static Registry inst; + return &inst; + } + /*! + * \brief add a new entry + * \returns new object associated with registered name + */ + T& add(const char* name) { + T *entry = new T(name); + entries.push_back(entry); + return *entry; + } + int size() { + return entries.size(); + } + T& get(int idx) { + return *(entries[idx]); + } + private: + /*! \brief constructor */ + Registry() {} + /*! \brief destructor */ + ~Registry() {} + /*! \brief map of entries in registry */ + std::vector entries; +}; + + +/* + * Macros to help with string concat + * Annoyingly, the concat_ and concat macros are necessary to + * be able to use __COUNTER__ in an identifier name + */ +#define _STR_CONCAT_(__a, __b) __a ## __b +#define _STR_CONCAT(__a, __b) _STR_CONCAT_(__a, __b) + +/*! + * \brief convert a token to a string + */ +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + +/*! 
+ * \brief declare a variable with custom name + */ +#define _REGISTER_NAME_(Name) MXNet ## _CustomOp ## _ +#define _REGISTER_DEF_(Name) CustomOp _REGISTER_NAME_(Name) + +/*! + * \brief assign a var to a value + */ +#define REGISTER_OP(Name) _STR_CONCAT(_REGISTER_DEF_(Name), __COUNTER__) = Registry::get()->add(TOSTRING(Name)) + + /*! * \brief Following are the APIs implemented in the external library * Each API has a #define string that is used to lookup the function in the library * Followed by the function declaration */ + + +#define MXLIB_OPREGSIZE_STR "_opRegSize" +typedef int (*opRegSize_t)(void); + +#define MXLIB_OPREGGET_STR "_opRegGet" +typedef int (*opRegGet_t)(int, const char**, fcomp_t*, + parseAttrs_t*, inferType_t*, + inferShape_t*); + #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int); extern "C" { - /*! - * \brief Checks if the MXNet version is supported by the library. - * If supported, initializes the library. - * \param version MXNet version number passed to library and defined as: - * MXNET_VERSION = (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH) - * \return Non-zero value on error i.e. library incompatible with passed MXNet version - */ + /*! + * \brief returns number of ops registered in this library + */ + int _opRegSize() { + return Registry::get()->size(); + } + + /*! + * \brief returns operator registration at specified index + */ + void _opRegGet(int idx, const char** name, fcomp_t* fcomp, + parseAttrs_t* parse, inferType_t* type, + inferShape_t* shape) { + CustomOp op = Registry::get()->get(idx); + *name = op.name; + *fcomp = op.fcompute; + *parse = op.parse_attrs; + *type = op.infer_type; + *shape = op.infer_shape; + } + + /*! + * \brief Checks if the MXNet version is supported by the library. + * If supported, initializes the library. + * \param version MXNet version number passed to library and defined as: + * MXNET_VERSION = (MXNET_MAJOR*10000 + MXNET_MINOR*100 + MXNET_PATCH) + * \return Non-zero value on error i.e. 
library incompatible with passed MXNet version + */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) int __cdecl initialize(int); + __declspec(dllexport) int __cdecl initialize(int); #else - int initialize(int); + int initialize(int); #endif } #endif // MXNET_LIB_API_H_ diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index f3e0ba8f5c26..50c91811e679 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -102,6 +102,30 @@ int MXLoadLib(const char *path) { initialize_t initialize = get_func(lib, const_cast(MXLIB_INITIALIZE_STR)); if (!initialize(static_cast(MXNET_VERSION))) LOG(FATAL) << "Library failed to initialize"; + + opRegSize_t opRegSize = get_func(lib, const_cast(MXLIB_OPREGSIZE_STR)); + int numOps = opRegSize(); + LOG(INFO) << "Found " << numOps << " operators in library"; + + opRegGet_t opRegGet = get_func(lib, const_cast(MXLIB_OPREGGET_STR)); + for(int i=0; i Date: Sat, 17 Aug 2019 02:39:05 +0000 Subject: [PATCH 034/111] added fcompute registration for loaded operators moved library import order to after ndarray/symbol --- example/lib_ops/test.py | 2 ++ include/mxnet/lib_api.h | 37 +++++++++++++++++++++++++++ python/mxnet/__init__.py | 3 ++- python/mxnet/library.py | 8 +++++- src/c_api/c_api.cc | 55 +++++++++++++++++++++++++++++++++++++++- 5 files changed, 102 insertions(+), 3 deletions(-) diff --git a/example/lib_ops/test.py b/example/lib_ops/test.py index 840924c1317c..8c7ccf02c886 100644 --- a/example/lib_ops/test.py +++ b/example/lib_ops/test.py @@ -31,3 +31,5 @@ elif (os.name=='nt'): path = os.path.abspath('mylib.so') mx.library.load(path) + +print(mx.nd.sam) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 8668ca483326..4ea284e39754 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -190,6 +190,11 @@ typedef int (*opRegGet_t)(int, const char**, fcomp_t*, parseAttrs_t*, inferType_t*, inferShape_t*); +#define MXLIB_OPCALLFCOMP_STR "_opCallFCompute" +typedef int (*opCallFComp_t)(fcomp_t, const char* const*, const char* const*, int, + const int64_t**, int*, void**, int*, int, + const int64_t**, int*, void**, int*, int); + #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int); @@ -215,6 +220,38 @@ extern "C" { *shape = op.infer_shape; } + int _opCallFCompute(fcomp_t fcomp, const char* const* keys, const char* const* vals, int num, + const int64_t** inshapes, int* indims, void** indata, int* intypes, int num_in, + const int64_t** outshapes, int* outdims, void** outdata, int* outtypes, int num_out) { + //create map of attributes from list + std::map attrs; + for(int i=0; i inputs(num_in); + for(int i=0; i outputs(num_out); + for(int i=0; i(lib, const_cast(MXLIB_INITIALIZE_STR)); if (!initialize(static_cast(MXNET_VERSION))) LOG(FATAL) << "Library failed to initialize"; + //get function to call fcompute + opCallFComp_t callFComp = get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); + + //get number of operators registered in the library opRegSize_t opRegSize = get_func(lib, const_cast(MXLIB_OPREGSIZE_STR)); int numOps = opRegSize(); LOG(INFO) << "Found " << numOps << " operators in library"; + //loop and register each operator in the library opRegGet_t opRegGet = get_func(lib, const_cast(MXLIB_OPREGGET_STR)); for(int i=0; i& inputs, + const std::vector& req, + const std::vector& outputs) { + //convert attributes to vector of char* + std::vector attr_keys,attr_vals; + for(auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } 
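/* Editor's note: the conversion below flattens every tensor into four parallel
 * C arrays -- data pointer, shape pointer, rank, dtype -- since only plain C
 * types can safely cross the dlopen boundary. The library side of this patch
 * (_opCallFCompute) reassembles MXTensor objects from the same parts, roughly:
 *
 *   MXTensor t;
 *   t.data  = indata[i];                      // borrowed, not owned
 *   t.dtype = (MXDType) intypes[i];
 *   for (int j = 0; j < indims[i]; j++)
 *     t.shape.push_back(inshapes[i][j]);      // dims copied by value
 */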
+ + std::vector in_data, out_data; + std::vector in_shapes, out_shapes; + std::vector in_dims, out_dims; + std::vector in_types, out_types; + + //convert input tensors to constituant parts + for(size_t i=0; i::Get()->__REGISTER_OR_GET__(name); + regOp.set_attr("FCompute",fcomp_conv); } API_END(); From 794e30bb7463020f09145967597fa9ca27394f7b Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Sat, 17 Aug 2019 05:58:56 +0000 Subject: [PATCH 035/111] changed dynamic ops to be contrib --- example/lib_ops/test.py | 2 +- src/c_api/c_api.cc | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/example/lib_ops/test.py b/example/lib_ops/test.py index 8c7ccf02c886..bb2db9ab4be6 100644 --- a/example/lib_ops/test.py +++ b/example/lib_ops/test.py @@ -32,4 +32,4 @@ path = os.path.abspath('mylib.so') mx.library.load(path) -print(mx.nd.sam) +print(mx.nd.contrib.sam) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index f3a38a7ff6a3..19dc96eac636 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1,3 +1,4 @@ + /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file @@ -175,7 +176,9 @@ int MXLoadLib(const char *path) { }; //re-register op in MXNet using lambda converter functions - nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); + std::string contrib_name("_contrib_"); + contrib_name += name; + nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(contrib_name.c_str()); regOp.set_attr("FCompute",fcomp_conv); } From 8fbf664963c2cce2304b438adf4898b9b1175eda Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Sun, 18 Aug 2019 07:41:27 +0000 Subject: [PATCH 036/111] added num in/out --- include/mxnet/lib_api.h | 16 +++++++++++++ src/c_api/c_api.cc | 53 ++++++++++++++++++++++++++++++++++------- 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 4ea284e39754..47ed086333fd 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -190,6 +190,10 @@ typedef int (*opRegGet_t)(int, const char**, fcomp_t*, parseAttrs_t*, inferType_t*, inferShape_t*); +#define MXLIB_OPCALLPARSEATTRS_STR "_opCallParseAttrs" +typedef int (*opCallParseAttrs_t)(parseAttrs_t, const char* const*, const char* const*, int, + int*, int*); + #define MXLIB_OPCALLFCOMP_STR "_opCallFCompute" typedef int (*opCallFComp_t)(fcomp_t, const char* const*, const char* const*, int, const int64_t**, int*, void**, int*, int, @@ -220,6 +224,18 @@ extern "C" { *shape = op.infer_shape; } + int _opCallParseAttrs(parseAttrs_t parseAttrs, const char* const* keys, const char* const* vals, int num, + int* num_in, int* num_out) { + //create map of attributes from list + std::map attrs; + for(int i=0; i(MXNET_VERSION))) LOG(FATAL) << "Library failed to initialize"; - //get function to call fcompute + //get call functions + opCallParseAttrs_t callParseAttrs = get_func(lib, const_cast(MXLIB_OPCALLPARSEATTRS_STR)); opCallFComp_t callFComp = get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); //get number of operators registered in the library @@ -132,14 +133,26 @@ int MXLoadLib(const char *path) { CHECK(shape != nullptr) << "Error loading '" << name << "' custom op, InferShape function was not set."; LOG(INFO) << "\tOp[" << i << "] " << name; - std::string name_str(name); - //generate lambda functions to convert from MXNet types to external types - auto fcomp_conv = [=](const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, 
- const std::vector& outputs) { + + auto num_inputs = [=](const NodeAttrs& attrs) { + //convert attributes to vector of char + std::vector attr_keys, attr_vals; + for(auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + + int num_in=-1; + int num_out=-1; + CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), + &num_in, &num_out)) + << "Error calling ParseAttrs for custom operator '" << name_str << "'"; + + return num_in; + }; + + auto num_outputs = [=](const NodeAttrs& attrs) { //convert attributes to vector of char* std::vector attr_keys,attr_vals; for(auto kv : attrs.dict) { @@ -147,6 +160,28 @@ int MXLoadLib(const char *path) { attr_vals.push_back(kv.second.c_str()); } + int num_in=-1; + int num_out=-1; + CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), + &num_in, &num_out)) + << "Error calling ParseAttrs for custom operator '" << name_str << "'"; + + return num_out; + }; + + // lambda function to convert from external fcompute to internal MXNet types + auto fcomp_conv = [=](const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + //convert attributes to vector of char* + std::vector attr_keys, attr_vals; + for(auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + std::vector in_data, out_data; std::vector in_shapes, out_shapes; std::vector in_dims, out_dims; @@ -180,6 +215,8 @@ int MXLoadLib(const char *path) { contrib_name += name; nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(contrib_name.c_str()); regOp.set_attr("FCompute",fcomp_conv); + regOp.set_num_inputs(num_inputs); + regOp.set_num_outputs(num_outputs); } API_END(); From e7c6e8fe9f88fc081c3fc2c5f48c4af83b0ff795 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 20 Aug 2019 05:26:15 +0000 Subject: [PATCH 037/111] removed contrib op registration re-registered ops from mx.nd.op to mx.nd --- python/mxnet/library.py | 15 +++++++++++++++ src/c_api/c_api.cc | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/python/mxnet/library.py b/python/mxnet/library.py index ec6e47d93f12..fcf39815d8e9 100644 --- a/python/mxnet/library.py +++ b/python/mxnet/library.py @@ -19,6 +19,7 @@ """Library management API of mxnet.""" from __future__ import absolute_import import ctypes +import sys import os from .base import _LIB, check_call, MXNetError, _init_op_module from .ndarray.register import _make_ndarray_function @@ -53,3 +54,17 @@ def load(path): #regenerate operators _init_op_module('mxnet', 'ndarray', _make_ndarray_function) _init_op_module('mxnet', 'symbol', _make_symbol_function) + + #re-register mx.nd.op into mx.nd + mx_nd = sys.modules["mxnet.ndarray"] + mx_nd_op = sys.modules["mxnet.ndarray.op"] + for op in dir(mx_nd_op): + func = getattr(mx_nd_op,op) + setattr(mx_nd,op,func) + + #re-register mx.sym.op into mx.sym + mx_sym = sys.modules["mxnet.symbol"] + mx_sym_op = sys.modules["mxnet.symbol.op"] + for op in dir(mx_sym_op): + func = getattr(mx_sym_op,op) + setattr(mx_sym,op,func) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 707d12349627..6bd309951b0e 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -211,7 +211,8 @@ int MXLoadLib(const char *path) { }; //re-register op in MXNet using lambda converter functions - std::string contrib_name("_contrib_"); + //std::string contrib_name("_contrib_"); + std::string 
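/* Editor's note: emptying this prefix (statement continues below) makes a
 * library op named "sam" register as plain "sam" instead of "_contrib_sam",
 * so it surfaces as mx.nd.sam. That is also why library.py in this patch
 * re-copies the regenerated mxnet.ndarray.op / mxnet.symbol.op functions back
 * into mxnet.ndarray and mxnet.symbol. Effect at the Python level
 * (illustrative):
 *
 *   mx.library.load(path)   # registers "sam" and rebuilds the op modules
 *   y = mx.nd.sam(a, b)     # resolves thanks to the module re-export
 */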
contrib_name(""); contrib_name += name; nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(contrib_name.c_str()); regOp.set_attr("FCompute",fcomp_conv); From 60473785166397b9bd15f6f8055448622d928550 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 20 Aug 2019 06:59:57 +0000 Subject: [PATCH 038/111] added support for infer shape, updated example to call operator --- example/lib_ops/mylib.cc | 46 ++++++++++++--- example/lib_ops/test.py | 12 +++- include/mxnet/lib_api.h | 66 ++++++++++++++++++++- src/c_api/c_api.cc | 123 +++++++++++++++++++++++++++++++++++---- 4 files changed, 226 insertions(+), 21 deletions(-) diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index 89a012d310d5..5916c9d46683 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -26,7 +26,10 @@ #include #include "lib_api.h" -void gemm(double* A, double* B, double* C, unsigned n, unsigned k, unsigned m) { +/* + * main matrix multiplication routine + */ +void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { unsigned i,j,kk; for (i=0;i attrs, std::vector inputs, std::vector outputs) { + //validate inputs + for(int i=0; i(); - double* input2 = inputs[1].getData(); - double* output = outputs[0].getData(); + //extract data pointers from tensors + float* input1 = inputs[0].getData(); + float* input2 = inputs[1].getData(); + float* output = outputs[0].getData(); + //set tensor shapes unsigned n = inputs[0].shape[0]; unsigned k = inputs[0].shape[1]; unsigned m = inputs[1].shape[1]; gemm(input1, input2, output, n, k, m); - return 1; + return 1; //no error } int parseAttrs(std::map attrs, int* num_in, int* num_out) { - + /* if(attrs.find("myParam") == attrs.end()) { std::cout << "Missing param 'myParam'" << std::endl; return 0; } - + */ *num_in = 2; *num_out = 1; @@ -76,11 +89,30 @@ int inferType(std::map attrs, std::vector &intypes int inferShape(std::map attrs, std::vector> &inshapes, std::vector> &outshapes) { + //validate inputs + if(inshapes.size() != 2) { + std::cout << "Expected 2 inputs to inferShape" << std::endl; + return 0; + } + + if(inshapes[0].size() != 2) { + std::cout << "Expected 2D for first input to inferShape" << std::endl; + return 0; + } + + if(inshapes[1].size() != 2) { + std::cout << "Expected 2D for second input to inferShape" << std::endl; + return 0; + } + unsigned n = inshapes[0][0]; unsigned k = inshapes[0][1]; unsigned kk = inshapes[1][0]; unsigned m = inshapes[1][1]; + std::cout << "inshapes[0][0]=" << n << " inshapes[0][1]=" << k << std::endl; + std::cout << "inshapes[1][0]=" << kk << " inshapes[1][1]=" << m << std::endl; + if(k != kk) return 0; outshapes[0].push_back(n); diff --git a/example/lib_ops/test.py b/example/lib_ops/test.py index bb2db9ab4be6..cdf78bc5c9c3 100644 --- a/example/lib_ops/test.py +++ b/example/lib_ops/test.py @@ -25,6 +25,7 @@ import mxnet as mx import os +#load library if (os.name=='posix'): path = os.path.abspath('mylib.so') mx.library.load(path) @@ -32,4 +33,13 @@ path = os.path.abspath('mylib.so') mx.library.load(path) -print(mx.nd.contrib.sam) +#setup inputs to call test operator +a = mx.nd.array([[1,2],[3,4]]) +b = mx.nd.array([[5,6],[7,8]]) + +#print inputs +print(a) +print(b) + +#compute and print output +print(mx.nd.sam(a,b)) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 47ed086333fd..e19d67cccee4 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -190,10 +190,18 @@ typedef int (*opRegGet_t)(int, const char**, fcomp_t*, parseAttrs_t*, inferType_t*, inferShape_t*); 
+#define MXLIB_OPCALLFREE_STR "_opCallFree" +typedef int (*opCallFree_t)(void*); + #define MXLIB_OPCALLPARSEATTRS_STR "_opCallParseAttrs" typedef int (*opCallParseAttrs_t)(parseAttrs_t, const char* const*, const char* const*, int, int*, int*); +#define MXLIB_OPCALLINFERSHAPE_STR "_opCallInferShape" +typedef int (*opCallInferShape_t)(inferShape_t, const char* const*, const char* const*, int, + unsigned int**, int*, int, + unsigned int***, int**, int); + #define MXLIB_OPCALLFCOMP_STR "_opCallFCompute" typedef int (*opCallFComp_t)(fcomp_t, const char* const*, const char* const*, int, const int64_t**, int*, void**, int*, int, @@ -224,6 +232,16 @@ extern "C" { *shape = op.infer_shape; } + /*! + * \brief calls free from the external library for library allocated arrays + */ + void _opCallFree(void* ptr) { + free(ptr); + } + + /*! + * \brief returns status of calling parse attributes function for operator from library + */ int _opCallParseAttrs(parseAttrs_t parseAttrs, const char* const* keys, const char* const* vals, int num, int* num_in, int* num_out) { //create map of attributes from list @@ -234,8 +252,52 @@ extern "C" { return parseAttrs(attrs,num_in,num_out); } - - + + /*! + * \brief returns status of calling infer shape function for operator from library + */ + int _opCallInferShape(inferShape_t inferShape, const char* const* keys, const char* const* vals, int num, + unsigned int** inshapes, int* indims, int num_in, + unsigned int*** outshapes, int** outdims, int num_out) { + //create map of attributes from list + std::map attrs; + for(int i=0; i > in_shapes(num_in); + for(int i=0; i > out_shapes(num_out); + + int retval = inferShape(attrs,in_shapes,out_shapes); + if(!retval) return retval; + + //allocate space for output dims, shape + *outdims = (int*)malloc(num_out*sizeof(int)); + *outshapes = (unsigned**)malloc(num_out*sizeof(unsigned*)); + + //copy output shapes + for(int i=0; i(lib, const_cast(MXLIB_OPCALLFREE_STR)); opCallParseAttrs_t callParseAttrs = get_func(lib, const_cast(MXLIB_OPCALLPARSEATTRS_STR)); + opCallInferShape_t callInferShape = get_func(lib, const_cast(MXLIB_OPCALLINFERSHAPE_STR)); opCallFComp_t callFComp = get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); //get number of operators registered in the library @@ -126,7 +128,7 @@ int MXLoadLib(const char *path) { //get operator from the library opRegGet(i,&name, &fcomp, &parse, &type, &shape); - //validate operator in the library + //validate operator functions from the library CHECK(fcomp != nullptr) << "Error loading '" << name << "' custom op, FCompute function was not set."; CHECK(parse != nullptr) << "Error loading '" << name << "' custom op, ParseAttrs function was not set."; CHECK(type != nullptr) << "Error loading '" << name << "' custom op, InferType function was not set."; @@ -135,6 +137,31 @@ int MXLoadLib(const char *path) { LOG(INFO) << "\tOp[" << i << "] " << name; std::string name_str(name); + /* + * Below are a series of lambda functions that will be registered in the NNVM op registration + * Each one has the standard MXNet signature and converts to types supported by externally + * registered operators. 
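/* Editor's note (declaration continues below): the _opCallInferShape API in
 * the header diff above mallocs its output dims/shape buffers inside the
 * library, so MXNet must release them with the library's own _opCallFree
 * rather than its own free() -- the two binaries may not share a heap. The
 * cleanup this implies on the MXNet side:
 *
 *   callFree(outdims);
 *   for (unsigned i = 0; i < num_out; i++)
 *     callFree(outshapes[i]);
 *   callFree(outshapes);
 */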
+ */ + + //lambda function to call parse attributes + auto attr_parser = [=](const NodeAttrs* attrs) { + //convert attributes to vector of char + std::vector attr_keys, attr_vals; + for(auto kv : attrs->dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + + int num_in=-1; + int num_out=-1; + CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), + &num_in, &num_out)) + << "Error calling ParseAttrs for custom operator '" << name_str << "'"; + + //return type void + }; + + //lambda function to call parse attributes and return the number of inputs auto num_inputs = [=](const NodeAttrs& attrs) { //convert attributes to vector of char std::vector attr_keys, attr_vals; @@ -147,11 +174,12 @@ int MXLoadLib(const char *path) { int num_out=-1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) - << "Error calling ParseAttrs for custom operator '" << name_str << "'"; + << "Error calling ParseAttrs::num_inputs for custom operator '" << name_str << "'"; return num_in; }; + //lambda function to call parse attributes and return the number of outputs auto num_outputs = [=](const NodeAttrs& attrs) { //convert attributes to vector of char* std::vector attr_keys,attr_vals; @@ -164,11 +192,82 @@ int MXLoadLib(const char *path) { int num_out=-1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) - << "Error calling ParseAttrs for custom operator '" << name_str << "'"; + << "Error calling ParseAttrs::num_outputs for custom operator '" << name_str << "'"; return num_out; }; + //lambda function to call infer shape + auto infer_shape = [=] (const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_shape, + mxnet::ShapeVector *out_shape) { + //convert attributes to vector of char* + std::vector attr_keys, attr_vals; + for(auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + + std::vector inshapes(in_shape->size()); + std::vector indims(in_shape->size()); + + //determine amount of memory needed to store all the input shapes + size_t buff_size = 0; + for (const auto& i : *in_shape) buff_size += i.ndim(); + + //copy input shapes from ShapeVector to raw memory layout + std::vector inbuff(buff_size); + uint32_t *ptr = inbuff.data(); + for (size_t i = 0; i < in_shape->size(); ++i) { + inshapes[i] = ptr; + indims[i] = (*in_shape)[i].ndim(); + for (int j = 0; j < (*in_shape)[i].ndim(); ++j, ++ptr) { + *ptr = static_cast((*in_shape)[i][j]); + } + } + + //output shapes will be allocated by infer shape function + uint32_t** outshapes = nullptr; + int* outdims = nullptr; + + CHECK(callInferShape(shape, attr_keys.data(), attr_vals.data(), attr_keys.size(), + inshapes.data(), indims.data(), in_shape->size(), + &outshapes, &outdims, out_shape->size())) + << "Error calling InferShape for custom operator '" << name_str << "'"; + + std::vector out_shapes(out_shape->size()); + //determine amount of memory needed to store all the output shapes + buff_size = 0; + for (unsigned i=0; isize(); i++) { + buff_size += outdims[i]; + } + + //copy output shapes from custom op memory to MXNet memory + std::vector outbuff(buff_size); + ptr = outbuff.data(); + for (unsigned i = 0; i < out_shape->size(); ++i) { + out_shapes[i] = ptr; + for (int j = 0; j < outdims[i]; ++j, ++ptr) { + *ptr = static_cast(outshapes[i][j]); + } + } + + //assign output shapes to ShapeVector + for (unsigned i = 0; i < out_shape->size(); ++i) { + 
SHAPE_ASSIGN_CHECK(*out_shape, i, + mxnet::TShape(out_shapes[i], out_shapes[i]+outdims[i])); + } + + //free memory used by custom op to allocate shapes/dims + callFree(outdims); + for(unsigned i=0; isize(); i++) { + callFree(outshapes[i]); + } + callFree(outshapes); + + return true; + }; + // lambda function to convert from external fcompute to internal MXNet types auto fcomp_conv = [=](const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -187,7 +286,7 @@ int MXLoadLib(const char *path) { std::vector in_dims, out_dims; std::vector in_types, out_types; - //convert input tensors to constituant parts + //convert input tensors to constituent parts for(size_t i=0; i::Get()->__REGISTER_OR_GET__(contrib_name.c_str()); - regOp.set_attr("FCompute",fcomp_conv); + nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); + regOp.set_attr_parser(attr_parser); regOp.set_num_inputs(num_inputs); regOp.set_num_outputs(num_outputs); + regOp.set_attr("FInferShape", infer_shape); + regOp.add_argument("data", "NDArray[]", "Source inputs"); + regOp.set_attr("FCompute",fcomp_conv); } API_END(); From d1587ab5de736b5f485e02a7b2a5e368393ef450 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 20 Aug 2019 07:47:47 +0000 Subject: [PATCH 039/111] fixed whitespace --- include/mxnet/lib_api.h | 101 ++++++++++++++++++---------------- src/c_api/c_api.cc | 118 ++++++++++++++++++++++------------------ 2 files changed, 119 insertions(+), 100 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index e19d67cccee4..1f17b0344586 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -25,9 +25,10 @@ #ifndef MXNET_LIB_API_H_ #define MXNET_LIB_API_H_ +#include #include #include -#include +#include /*! * \brief External Tensor data types @@ -55,10 +56,10 @@ struct MXTensor { */ template data_type* getData() { - return (data_type*)data; + return reinterpret_cast(data); } - void *data; // not owned + void *data; // not owned std::vector shape; MXDType dtype; }; @@ -66,13 +67,13 @@ struct MXTensor { /*! * Custom Operator function templates */ -typedef int (*fcomp_t)(std::map, +typedef int (*fcomp_t)(std::map, std::vector, std::vector); -typedef int (*parseAttrs_t)(std::map, +typedef int (*parseAttrs_t)(std::map, int*, int*); -typedef int (*inferType_t)(std::map, +typedef int (*inferType_t)(std::map, std::vector&, std::vector&); -typedef int (*inferShape_t)(std::map, +typedef int (*inferShape_t)(std::map, std::vector>&, std::vector>&); @@ -80,8 +81,8 @@ typedef int (*inferShape_t)(std::map, * \brief Class to hold custom operator registration */ class CustomOp { - public: - CustomOp(const char* op_name) : name(op_name), fcompute(nullptr), + public: + explicit CustomOp(const char* op_name) : name(op_name), fcompute(nullptr), parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr) {} ~CustomOp() {} CustomOp& setFCompute(fcomp_t fcomp) { @@ -139,6 +140,7 @@ class Registry { T& get(int idx) { return *(entries[idx]); } + private: /*! \brief constructor */ Registry() {} @@ -146,8 +148,7 @@ class Registry { ~Registry() {} /*! \brief map of entries in registry */ std::vector entries; -}; - +}; /* * Macros to help with string concat @@ -172,7 +173,8 @@ class Registry { /*! * \brief assign a var to a value */ -#define REGISTER_OP(Name) _STR_CONCAT(_REGISTER_DEF_(Name), __COUNTER__) = Registry::get()->add(TOSTRING(Name)) +#define REGISTER_OP(Name) _STR_CONCAT(_REGISTER_DEF_(Name), __COUNTER__) = \ + Registry::get()->add(TOSTRING(Name)) /*! @@ -242,52 +244,54 @@ extern "C" { /*! 
* \brief returns status of calling parse attributes function for operator from library */ - int _opCallParseAttrs(parseAttrs_t parseAttrs, const char* const* keys, const char* const* vals, int num, + int _opCallParseAttrs(parseAttrs_t parseAttrs, const char* const* keys, + const char* const* vals, int num, int* num_in, int* num_out) { - //create map of attributes from list - std::map attrs; - for(int i=0; i attrs; + for (int i = 0; i < num; i++) { attrs[std::string(keys[i])] = std::string(vals[i]); } - return parseAttrs(attrs,num_in,num_out); + return parseAttrs(attrs, num_in, num_out); } /*! * \brief returns status of calling infer shape function for operator from library */ - int _opCallInferShape(inferShape_t inferShape, const char* const* keys, const char* const* vals, int num, + int _opCallInferShape(inferShape_t inferShape, const char* const* keys, + const char* const* vals, int num, unsigned int** inshapes, int* indims, int num_in, unsigned int*** outshapes, int** outdims, int num_out) { - //create map of attributes from list - std::map attrs; - for(int i=0; i attrs; + for (int i = 0; i < num; i++) { attrs[std::string(keys[i])] = std::string(vals[i]); } - //create a vector of shapes for inputs + // create a vector of shapes for inputs std::vector > in_shapes(num_in); - for(int i=0; i > out_shapes(num_out); - int retval = inferShape(attrs,in_shapes,out_shapes); + int retval = inferShape(attrs, in_shapes, out_shapes); if(!retval) return retval; - //allocate space for output dims, shape - *outdims = (int*)malloc(num_out*sizeof(int)); - *outshapes = (unsigned**)malloc(num_out*sizeof(unsigned*)); + // allocate space for output dims, shape + *outdims = (int*) malloc (num_out * sizeof(int)); + *outshapes = (unsigned**) malloc (num_out * sizeof(unsigned*)); - //copy output shapes - for(int i=0; i attrs; - for(int i=0; i attrs; + for (int i = 0; i < num; i++) { attrs[std::string(keys[i])] = std::string(vals[i]); } - //create a vector of tensors for inputs + // create a vector of tensors for inputs std::vector inputs(num_in); - for(int i=0; i outputs(num_out); - for(int i=0; i(lib, const_cast(MXLIB_INITIALIZE_STR)); if (!initialize(static_cast(MXNET_VERSION))) LOG(FATAL) << "Library failed to initialize"; - //get call functions + // get call functions opCallFree_t callFree = get_func(lib, const_cast(MXLIB_OPCALLFREE_STR)); - opCallParseAttrs_t callParseAttrs = get_func(lib, const_cast(MXLIB_OPCALLPARSEATTRS_STR)); - opCallInferShape_t callInferShape = get_func(lib, const_cast(MXLIB_OPCALLINFERSHAPE_STR)); - opCallFComp_t callFComp = get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); - //get number of operators registered in the library + opCallParseAttrs_t callParseAttrs = + get_func(lib, const_cast(MXLIB_OPCALLPARSEATTRS_STR)); + + opCallInferShape_t callInferShape = + get_func(lib, const_cast(MXLIB_OPCALLINFERSHAPE_STR)); + + opCallFComp_t callFComp = + get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); + + // get number of operators registered in the library opRegSize_t opRegSize = get_func(lib, const_cast(MXLIB_OPREGSIZE_STR)); int numOps = opRegSize(); LOG(INFO) << "Found " << numOps << " operators in library"; - //loop and register each operator in the library + // loop and register each operator in the library opRegGet_t opRegGet = get_func(lib, const_cast(MXLIB_OPREGGET_STR)); - for(int i=0; i attr_keys, attr_vals; - for(auto kv : attrs->dict) { + for (auto kv : attrs->dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } - int num_in=-1; - int 
num_out=-1; + int num_in = -1; + int num_out = -1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs for custom operator '" << name_str << "'"; - //return type void + // return type void }; - //lambda function to call parse attributes and return the number of inputs + // lambda function to call parse attributes and return the number of inputs auto num_inputs = [=](const NodeAttrs& attrs) { - //convert attributes to vector of char + // convert attributes to vector of char std::vector attr_keys, attr_vals; - for(auto kv : attrs.dict) { + for (auto kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } - int num_in=-1; - int num_out=-1; + int num_in = -1; + int num_out = -1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs::num_inputs for custom operator '" << name_str << "'"; @@ -179,17 +189,17 @@ int MXLoadLib(const char *path) { return num_in; }; - //lambda function to call parse attributes and return the number of outputs + // lambda function to call parse attributes and return the number of outputs auto num_outputs = [=](const NodeAttrs& attrs) { - //convert attributes to vector of char* - std::vector attr_keys,attr_vals; - for(auto kv : attrs.dict) { + // convert attributes to vector of char* + std::vector attr_keys, attr_vals; + for (auto kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } - int num_in=-1; - int num_out=-1; + int num_in = -1; + int num_out = -1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs::num_outputs for custom operator '" << name_str << "'"; @@ -197,13 +207,13 @@ int MXLoadLib(const char *path) { return num_out; }; - //lambda function to call infer shape + // lambda function to call infer shape auto infer_shape = [=] (const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_shape, mxnet::ShapeVector *out_shape) { - //convert attributes to vector of char* + // convert attributes to vector of char* std::vector attr_keys, attr_vals; - for(auto kv : attrs.dict) { + for (auto kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } @@ -211,11 +221,11 @@ int MXLoadLib(const char *path) { std::vector inshapes(in_shape->size()); std::vector indims(in_shape->size()); - //determine amount of memory needed to store all the input shapes + // determine amount of memory needed to store all the input shapes size_t buff_size = 0; for (const auto& i : *in_shape) buff_size += i.ndim(); - //copy input shapes from ShapeVector to raw memory layout + // copy input shapes from ShapeVector to raw memory layout std::vector inbuff(buff_size); uint32_t *ptr = inbuff.data(); for (size_t i = 0; i < in_shape->size(); ++i) { @@ -236,13 +246,13 @@ int MXLoadLib(const char *path) { << "Error calling InferShape for custom operator '" << name_str << "'"; std::vector out_shapes(out_shape->size()); - //determine amount of memory needed to store all the output shapes + // determine amount of memory needed to store all the output shapes buff_size = 0; for (unsigned i=0; isize(); i++) { buff_size += outdims[i]; } - //copy output shapes from custom op memory to MXNet memory + // copy output shapes from custom op memory to MXNet memory std::vector outbuff(buff_size); ptr = outbuff.data(); for (unsigned i = 0; i < out_shape->size(); 
++i) { @@ -258,9 +268,9 @@ int MXLoadLib(const char *path) { mxnet::TShape(out_shapes[i], out_shapes[i]+outdims[i])); } - //free memory used by custom op to allocate shapes/dims + // free memory used by custom op to allocate shapes/dims callFree(outdims); - for(unsigned i=0; isize(); i++) { + for (unsigned i = 0; i < out_shape->size(); i++) { callFree(outshapes[i]); } callFree(outshapes); @@ -274,9 +284,9 @@ int MXLoadLib(const char *path) { const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - //convert attributes to vector of char* + // convert attributes to vector of char* std::vector attr_keys, attr_vals; - for(auto kv : attrs.dict) { + for (auto kv : attrs.dict) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } @@ -286,32 +296,34 @@ int MXLoadLib(const char *path) { std::vector in_dims, out_dims; std::vector in_types, out_types; - //convert input tensors to constituent parts - for(size_t i=0; i::Get()->__REGISTER_OR_GET__(name); regOp.set_attr_parser(attr_parser); regOp.set_num_inputs(num_inputs); From 0ee56c90ab2528937d4886547c7a575184083e5b Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 20 Aug 2019 08:00:02 +0000 Subject: [PATCH 040/111] fixed whitespace --- include/mxnet/lib_api.h | 16 ++++++++-------- src/c_api/c_api.cc | 32 ++++++++++++++++---------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 1f17b0344586..0d5fa849d431 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -81,7 +81,7 @@ typedef int (*inferShape_t)(std::map, * \brief Class to hold custom operator registration */ class CustomOp { - public: + public: explicit CustomOp(const char* op_name) : name(op_name), fcompute(nullptr), parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr) {} ~CustomOp() {} @@ -140,7 +140,7 @@ class Registry { T& get(int idx) { return *(entries[idx]); } - + private: /*! \brief constructor */ Registry() {} @@ -148,7 +148,7 @@ class Registry { ~Registry() {} /*! \brief map of entries in registry */ std::vector entries; -}; +}; /* * Macros to help with string concat @@ -281,16 +281,16 @@ extern "C" { std::vector > out_shapes(num_out); int retval = inferShape(attrs, in_shapes, out_shapes); - if(!retval) return retval; + if (!retval) return retval; // allocate space for output dims, shape - *outdims = (int*) malloc (num_out * sizeof(int)); - *outshapes = (unsigned**) malloc (num_out * sizeof(unsigned*)); + *outdims = static_cast(malloc (num_out * sizeof(int))); + *outshapes = static_cast(malloc (num_out * sizeof(unsigned*))); // copy output shapes for (int i = 0; i < num_out; i++) { (*outdims)[i] = out_shapes[i].size(); - (*outshapes)[i] = (unsigned*) malloc ((*outdims)[i] * sizeof(unsigned)); + (*outshapes)[i] = static_cast(malloc ((*outdims)[i] * sizeof(unsigned))); for (int j = 0; j < indims[i]; j++) { (*outshapes)[i][j] = out_shapes[i][j]; } @@ -336,7 +336,7 @@ extern "C" { return fcomp(attrs, inputs, outputs); } - + /*! * \brief Checks if the MXNet version is supported by the library. * If supported, initializes the library. 
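 * \param version MXNet version number passed to library and defined as MXNET_VERSION
 * \return Non-zero value on error i.e. library incompatible with passed MXNet version
 */

The get_func helper used throughout MXLoadLib is not shown in these hunks. As a
point of reference, a minimal sketch of such a loader-side helper, assuming a
POSIX dlsym-based lookup like the one in libtest.cc (the error message wording
is illustrative):

    #include <dlfcn.h>          // dlsym (POSIX)
    #include <dmlc/logging.h>   // CHECK

    // resolve a named symbol from an already-loaded library handle and
    // cast it to the expected function-pointer type T
    template <typename T>
    T get_func(void* lib, char* func_name) {
      T func = reinterpret_cast<T>(dlsym(lib, func_name));
      CHECK(func != nullptr) << "Unable to get function '" << func_name
                             << "' from library";
      return func;
    }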
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 5ebdb8dc8c38..d49be1662d09 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -107,13 +107,13 @@ int MXLoadLib(const char *path) { // get call functions opCallFree_t callFree = get_func(lib, const_cast(MXLIB_OPCALLFREE_STR)); - + opCallParseAttrs_t callParseAttrs = get_func(lib, const_cast(MXLIB_OPCALLPARSEATTRS_STR)); - + opCallInferShape_t callInferShape = get_func(lib, const_cast(MXLIB_OPCALLINFERSHAPE_STR)); - + opCallFComp_t callFComp = get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); @@ -152,7 +152,7 @@ int MXLoadLib(const char *path) { * Each one has the standard MXNet signature and converts to types supported by externally * registered operators. */ - + // lambda function to call parse attributes auto attr_parser = [=](const NodeAttrs* attrs) { // convert attributes to vector of char @@ -161,7 +161,7 @@ int MXLoadLib(const char *path) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } - + int num_in = -1; int num_out = -1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), @@ -185,7 +185,7 @@ int MXLoadLib(const char *path) { CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs::num_inputs for custom operator '" << name_str << "'"; - + return num_in; }; @@ -197,13 +197,13 @@ int MXLoadLib(const char *path) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } - + int num_in = -1; int num_out = -1; CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs::num_outputs for custom operator '" << name_str << "'"; - + return num_out; }; @@ -236,7 +236,7 @@ int MXLoadLib(const char *path) { } } - //output shapes will be allocated by infer shape function + // output shapes will be allocated by infer shape function uint32_t** outshapes = nullptr; int* outdims = nullptr; @@ -248,7 +248,7 @@ int MXLoadLib(const char *path) { std::vector out_shapes(out_shape->size()); // determine amount of memory needed to store all the output shapes buff_size = 0; - for (unsigned i=0; isize(); i++) { + for (unsigned i = 0; i < out_shape->size(); i++) { buff_size += outdims[i]; } @@ -262,7 +262,7 @@ int MXLoadLib(const char *path) { } } - //assign output shapes to ShapeVector + // assign output shapes to ShapeVector for (unsigned i = 0; i < out_shape->size(); ++i) { SHAPE_ASSIGN_CHECK(*out_shape, i, mxnet::TShape(out_shapes[i], out_shapes[i]+outdims[i])); @@ -274,10 +274,10 @@ int MXLoadLib(const char *path) { callFree(outshapes[i]); } callFree(outshapes); - + return true; }; - + // lambda function to convert from external fcompute to internal MXNet types auto fcomp_conv = [=](const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -290,7 +290,7 @@ int MXLoadLib(const char *path) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } - + std::vector in_data, out_data; std::vector in_shapes, out_shapes; std::vector in_dims, out_dims; @@ -330,9 +330,9 @@ int MXLoadLib(const char *path) { regOp.set_num_outputs(num_outputs); regOp.set_attr("FInferShape", infer_shape); regOp.add_argument("data", "NDArray[]", "Source inputs"); - regOp.set_attr("FCompute",fcomp_conv); + regOp.set_attr("FCompute", fcomp_conv); } - + API_END(); } From adc977033957208686ecb058b78970bb08a27595 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 20 Aug 2019 08:04:35 +0000 Subject: [PATCH 041/111] 
fixed whitespace
---
 python/mxnet/library.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/mxnet/library.py b/python/mxnet/library.py
index fcf39815d8e9..8ea0bc2ae0a5 100644
--- a/python/mxnet/library.py
+++ b/python/mxnet/library.py
@@ -59,12 +59,12 @@ def load(path):
     mx_nd = sys.modules["mxnet.ndarray"]
     mx_nd_op = sys.modules["mxnet.ndarray.op"]
     for op in dir(mx_nd_op):
-        func = getattr(mx_nd_op,op)
-        setattr(mx_nd,op,func)
+        func = getattr(mx_nd_op, op)
+        setattr(mx_nd, op, func)
 
     #re-register mx.sym.op into mx.sym
     mx_sym = sys.modules["mxnet.symbol"]
     mx_sym_op = sys.modules["mxnet.symbol.op"]
     for op in dir(mx_sym_op):
-        func = getattr(mx_sym_op,op)
-        setattr(mx_sym,op,func)
+        func = getattr(mx_sym_op, op)
+        setattr(mx_sym, op, func)

From 5c06d476f3e37c793de74718489e89c5624c7678 Mon Sep 17 00:00:00 2001
From: Sam Skalicky
Date: Fri, 23 Aug 2019 19:10:53 +0000
Subject: [PATCH 042/111] added temporary support for operator multi-registration

---
 src/c_api/c_api.cc | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index d49be1662d09..251b4aa83f51 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -323,14 +323,35 @@ int MXLoadLib(const char *path) {
       // return type void
     };
 
-    // re-register op in MXNet using lambda converter functions
-    nnvm::Op &regOp = dmlc::Registry<nnvm::Op>::Get()->__REGISTER_OR_GET__(name);
-    regOp.set_attr_parser(attr_parser);
-    regOp.set_num_inputs(num_inputs);
-    regOp.set_num_outputs(num_outputs);
-    regOp.set_attr<mxnet::FInferShape>("FInferShape", infer_shape);
-    regOp.add_argument("data", "NDArray[]", "Source inputs");
-    regOp.set_attr<FCompute>("FCompute<cpu>", fcomp_conv);
+    //check if operator is already registered
+    const nnvm::Op *regOpPtr = dmlc::Registry<nnvm::Op>::Get()->Find(name);
+    if(regOpPtr == nullptr) {
+      // re-register op in MXNet using lambda converter functions
+      nnvm::Op &regOp = dmlc::Registry<nnvm::Op>::Get()->__REGISTER_OR_GET__(name);
+      regOp.set_attr_parser(attr_parser);
+      regOp.set_num_inputs(num_inputs);
+      regOp.set_num_outputs(num_outputs);
+
+      regOp.add_argument("data", "NDArray[]", "Source inputs");
+
+      regOp.set_attr<mxnet::FInferShape>("FInferShape", infer_shape);
+      regOp.set_attr<FCompute>("FCompute<cpu>", fcomp_conv);
+    } else {
+      //overwrite registration of existing op with custom op
+      nnvm::Op &regOp = dmlc::Registry<nnvm::Op>::Get()->__REGISTER_OR_GET__(name);
+
+      regOp.set_attr_parser(attr_parser);
+      regOp.set_num_inputs(num_inputs);
+      regOp.set_num_outputs(num_outputs);
+
+      regOp.arguments.clear();
+      regOp.add_argument("data", "NDArray[]", "Source inputs");
+
+      //set attribute with higher plevel (11) to allow re-registering once
+      //TODO: enable constant overwriting of registration multiple times
+      regOp.set_attr<mxnet::FInferShape>("FInferShape", infer_shape, 11);
+      regOp.set_attr<FCompute>("FCompute<cpu>", fcomp_conv, 11);
+    }
   }
 
   API_END();

From 91368390d3df8104c62934fe0dc4b5029dbfff37 Mon Sep 17 00:00:00 2001
From: Sam Skalicky
Date: Fri, 23 Aug 2019 19:17:38 +0000
Subject: [PATCH 043/111] insanity checked

---
 src/c_api/c_api.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 251b4aa83f51..dd68184f0409 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -325,7 +325,7 @@ int MXLoadLib(const char *path) {
 
     //check if operator is already registered
     const nnvm::Op *regOpPtr = dmlc::Registry<nnvm::Op>::Get()->Find(name);
-    if(regOpPtr == nullptr) {
+    if (regOpPtr == nullptr) {
      // re-register op in MXNet using lambda converter functions
      nnvm::Op &regOp =
dmlc::Registry::Get()->__REGISTER_OR_GET__(name); regOp.set_attr_parser(attr_parser); @@ -337,7 +337,7 @@ int MXLoadLib(const char *path) { regOp.set_attr("FInferShape", infer_shape); regOp.set_attr("FCompute", fcomp_conv); } else { - //overwrite registration of existing op with custom op + // overwrite registration of existing op with custom op nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); regOp.set_attr_parser(attr_parser); @@ -347,8 +347,8 @@ int MXLoadLib(const char *path) { regOp.arguments.clear(); regOp.add_argument("data", "NDArray[]", "Source inputs"); - //set attribute with higher plevel (11) to allow re-registering once - //TODO: enable constant overwriting of registertion multiple times + // set attribute with higher plevel (11) to allow re-registering once + // TODO(samskalicky): enable constant overwriting of registertion multiple times regOp.set_attr("FInferShape", infer_shape, 11); regOp.set_attr("FCompute", fcomp_conv, 11); } From ffe7623fa6a19a691d836b567f21416d46be71ad Mon Sep 17 00:00:00 2001 From: rondogency Date: Fri, 23 Aug 2019 20:06:04 +0000 Subject: [PATCH 044/111] update docblocks --- example/lib_ops/libtest.cc | 2 +- example/lib_ops/mylib.cc | 5 +++-- example/lib_ops/test.py | 1 + include/mxnet/lib_api.h | 5 ++++- python/mxnet/__init__.py | 1 + src/c_api/c_api.cc | 10 ++++++++-- 6 files changed, 18 insertions(+), 6 deletions(-) diff --git a/example/lib_ops/libtest.cc b/example/lib_ops/libtest.cc index 8bdf36c05d37..9fcdda55c64f 100644 --- a/example/lib_ops/libtest.cc +++ b/example/lib_ops/libtest.cc @@ -18,7 +18,7 @@ */ /*! - * Copyright (c) 2015 by Contributors + * Copyright (c) 2019 by Contributors * \file libtest.cc * \brief This test checks if the library is implemented correctly * and does not involve dynamic loading of library into MXNet diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index 5916c9d46683..f00b138c66dc 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -18,9 +18,10 @@ */ /*! - * Copyright (c) 2015 by Contributors + * Copyright (c) 2019 by Contributors * \file mylib.cc - * \brief Sample library file + * \brief Sample custom operator implementation + * library file */ #include diff --git a/example/lib_ops/test.py b/example/lib_ops/test.py index cdf78bc5c9c3..b9d5aeb49340 100644 --- a/example/lib_ops/test.py +++ b/example/lib_ops/test.py @@ -21,6 +21,7 @@ # pylint: disable=arguments-differ # This test checks if dynamic loading of library into MXNet is successful +# and checks the end of end computation of custom operator import mxnet as mx import os diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 0d5fa849d431..c36868adf1f8 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -18,10 +18,13 @@ */ /*! - * Copyright (c) 2015 by Contributors + * Copyright (c) 2019 by Contributors * \file lib_api.h * \brief APIs to interact with libraries + * This API specifies function prototypes to + * register custom ops for library authors */ + #ifndef MXNET_LIB_API_H_ #define MXNET_LIB_API_H_ diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index c76d78f54331..87beb23a8d2b 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -86,6 +86,7 @@ from . import gluon +# Dynamic library module should be done after ndarray and symbol are initialized from . 
import library __version__ = base.__version__ diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index dd68184f0409..27e126fa9b80 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -93,7 +93,10 @@ inline int MXAPIGetFunctionRegInfo(const FunRegType *e, // NOTE: return value is added in API_END -// Loads library and initializes it +/*! + * \brief Loads dynamic library and initializes it + * \param path library path + */ int MXLoadLib(const char *path) { API_BEGIN(); void *lib = LibraryInitializer::Get()->lib_load(path); @@ -122,7 +125,10 @@ int MXLoadLib(const char *path) { int numOps = opRegSize(); LOG(INFO) << "Found " << numOps << " operators in library"; - // loop and register each operator in the library + /* + * The library has custom operators implementation + * loop and register each operator in the library to NNVM + */ opRegGet_t opRegGet = get_func(lib, const_cast(MXLIB_OPREGGET_STR)); for (int i = 0; i < numOps; i++) { const char* name; From 435e01e30357a8681571b0dd73f5839a6949fcdd Mon Sep 17 00:00:00 2001 From: rondogency Date: Fri, 23 Aug 2019 20:10:21 +0000 Subject: [PATCH 045/111] small format fix --- src/c_api/c_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 27e126fa9b80..9b5549c41890 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -329,7 +329,7 @@ int MXLoadLib(const char *path) { // return type void }; - //check if operator is already registered + // check if operator is already registered const nnvm::Op *regOpPtr = dmlc::Registry::Get()->Find(name); if (regOpPtr == nullptr) { // re-register op in MXNet using lambda converter functions From 0de79a99cf221629431a0a2f70bc57e0f815f21f Mon Sep 17 00:00:00 2001 From: rondogency Date: Fri, 23 Aug 2019 23:00:24 +0000 Subject: [PATCH 046/111] fix unittest with correct library --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5c06dc800c4b..1edd0e14926a 100644 --- a/Makefile +++ b/Makefile @@ -662,7 +662,7 @@ pylint: python3 -m pylint --rcfile=$(ROOTDIR)/ci/other/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" python/mxnet tools/caffe_converter/*.py sample_lib: - $(CXX) -shared -fPIC -std=c++11 example/lib_api/mylib.cc -o libsample_lib.so -I include/mxnet + $(CXX) -shared -fPIC -std=c++11 example/lib_ops/mylib.cc -o libsample_lib.so -I include/mxnet doc: docs From 0d6f7b0c58e6ad505b49447bff58c35bb21ab046 Mon Sep 17 00:00:00 2001 From: rondogency Date: Tue, 27 Aug 2019 02:53:21 +0000 Subject: [PATCH 047/111] implement InferType --- example/lib_ops/mylib.cc | 27 +++++++++++++++++++++------ include/mxnet/lib_api.h | 40 +++++++++++++++++++++++++++++++++++++++- src/c_api/c_api.cc | 40 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 97 insertions(+), 10 deletions(-) diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index f00b138c66dc..4098e38aa06c 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -83,25 +83,39 @@ int parseAttrs(std::map attrs, int inferType(std::map attrs, std::vector &intypes, std::vector &outtypes) { + // validate inputs + if (intypes.size() != 2) { + std::cout << "Expected 2 inputs to inferType" << std::endl; + return 0; + } + + if (intypes[0] != intypes[1]) { + std::cout << "Expected 2 inputs to have same data type for inferType" << std::endl; + return 0; + } + outtypes[0] = intypes[0]; - + + std::cout << "intypes[0]=" << intypes[0] << " outtypes[0]=" << outtypes[0] << std::endl; + std::cout << "intypes=" << 
intypes.size() << " outtypes=" << outtypes.size() << std::endl; + return 1; //no error } int inferShape(std::map attrs, std::vector> &inshapes, std::vector> &outshapes) { - //validate inputs - if(inshapes.size() != 2) { + // validate inputs + if (inshapes.size() != 2) { std::cout << "Expected 2 inputs to inferShape" << std::endl; return 0; } - if(inshapes[0].size() != 2) { + if (inshapes[0].size() != 2) { std::cout << "Expected 2D for first input to inferShape" << std::endl; return 0; } - if(inshapes[1].size() != 2) { + if (inshapes[1].size() != 2) { std::cout << "Expected 2D for second input to inferShape" << std::endl; return 0; } @@ -114,7 +128,8 @@ int inferShape(std::map attrs, std::vector > out_shapes(num_out); int retval = inferShape(attrs, in_shapes, out_shapes); - if (!retval) return retval; + if (!retval) + return retval; // allocate space for output dims, shape *outdims = static_cast(malloc (num_out * sizeof(int))); @@ -302,6 +307,39 @@ extern "C" { return retval; } + /*! + * \brief returns status of calling InferType function for operator from library + */ + int _opCallInferType(inferType_t inferType, const char* const* keys, + const char* const* vals, int num, + int* intypes, int num_in, int* outtypes, int num_out) { + //create map of attributes from list + std::map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } + + // create a vector of types for inputs + std::vector in_types(num_in); + for (int i = 0; i < num_in; i++) { + in_types[i] = intypes[i]; + } + + // create a vector of types for outputs + std::vector out_types(num_out); + + int retval = inferType(attrs, in_types, out_types); + if (!retval) + return retval; + + // copy output types + for (int i = 0; i < num_out; i++) { + outtypes[i] = out_types[i]; + } + + return retval; + } + /*! 
 * \brief returns status of calling FCompute function for operator from library
 */

diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 9b5549c41890..d5b69b9ff611 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -117,6 +117,9 @@ int MXLoadLib(const char *path) {
   opCallInferShape_t callInferShape =
     get_func<opCallInferShape_t>(lib, const_cast<char*>(MXLIB_OPCALLINFERSHAPE_STR));
 
+  opCallInferType_t callInferType =
+    get_func<opCallInferType_t>(lib, const_cast<char*>(MXLIB_OPCALLINFERTYPE_STR));
+
   opCallFComp_t callFComp =
     get_func<opCallFComp_t>(lib, const_cast<char*>(MXLIB_OPCALLFCOMP_STR));
 
@@ -137,10 +140,10 @@ int MXLoadLib(const char *path) {
     inferType_t type = nullptr;
     inferShape_t shape = nullptr;
 
-    // get operator from the library
+    // get custom operator implementation from the dynamic library
     opRegGet(i, &name, &fcomp, &parse, &type, &shape);
 
-    // validate operator functions from the library
+    // validate custom operator functions from the dynamic library
     CHECK(fcomp != nullptr) << "Error loading '" << name
                             << "' custom op, FCompute function was not set.";
     CHECK(parse != nullptr) << "Error loading '" << name
@@ -284,6 +287,36 @@ int MXLoadLib(const char *path) {
       return true;
     };
 
+    // lambda function to call infer type
+    auto infer_type = [=] (const nnvm::NodeAttrs& attrs,
+                           std::vector<int> *in_type,
+                           std::vector<int> *out_type) {
+      // convert attributes to vector of char*
+      std::vector<const char*> attr_keys, attr_vals;
+      for (auto kv : attrs.dict) {
+        attr_keys.push_back(kv.first.c_str());
+        attr_vals.push_back(kv.second.c_str());
+      }
+
+      // copy input types from in_type
+      std::vector<int> intypes(*in_type);
+
+      // output types will be populated by inferType function
+      std::vector<int> outtypes(out_type->size());
+
+      CHECK(callInferType(type, attr_keys.data(), attr_vals.data(), attr_keys.size(),
+                          intypes.data(), in_type->size(),
+                          outtypes.data(), out_type->size()))
+      << "Error calling InferType for custom operator '" << name_str << "'";
+
+      // copy and assign output types from custom op to MXNet memory
+      for (size_t i = 0; i < out_type->size(); i++) {
+        TYPE_ASSIGN_CHECK(*out_type, i, outtypes[i]);
+      }
+
+      return true;
+    };
+
     // lambda function to convert from external fcompute to internal MXNet types
     auto fcomp_conv = [=](const nnvm::NodeAttrs& attrs,
                           const OpContext& ctx,
@@ -339,7 +372,7 @@ int MXLoadLib(const char *path) {
       regOp.set_num_outputs(num_outputs);
 
       regOp.add_argument("data", "NDArray[]", "Source inputs");
-
+      regOp.set_attr<nnvm::FInferType>("FInferType", infer_type);
       regOp.set_attr<mxnet::FInferShape>("FInferShape", infer_shape);
       regOp.set_attr<FCompute>("FCompute<cpu>", fcomp_conv);
     } else {
@@ -355,6 +388,7 @@ int MXLoadLib(const char *path) {
 
       // set attribute with higher plevel (11) to allow re-registering once
      // TODO(samskalicky): enable constant overwriting of registration multiple times
+      regOp.set_attr<nnvm::FInferType>("FInferType", infer_type, 11);
       regOp.set_attr<mxnet::FInferShape>("FInferShape", infer_shape, 11);
       regOp.set_attr<FCompute>("FCompute<cpu>", fcomp_conv, 11);
     }

From 18b028eb4f4b5ae244e85ec7cd9bde39fece4286 Mon Sep 17 00:00:00 2001
From: Sam Skalicky
Date: Tue, 27 Aug 2019 18:46:21 +0000
Subject: [PATCH 048/111] initial support for resource manager, temp space

---
 example/lib_ops/mylib.cc |  5 +++--
 include/mxnet/lib_api.h  | 38 +++++++++++++++++++++++++++++-----
 src/c_api/c_api.cc       | 27 +++++++++++++++++++++++++--
 3 files changed, 61 insertions(+), 9 deletions(-)

diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc
index 4098e38aa06c..fea5b84e30a6 100644
--- a/example/lib_ops/mylib.cc
+++ b/example/lib_ops/mylib.cc
@@ -44,7 +44,8 @@ void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) {
 int
myFCompute(std::map attrs, - std::vector inputs, std::vector outputs) { + std::vector inputs, std::vector outputs, + OpResource res) { //validate inputs for(int i=0; i attrs, return 0; } } - + //extract data pointers from tensors float* input1 = inputs[0].getData(); float* input2 = inputs[1].getData(); diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 24ec22fc42fe..b05f3ab936d1 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -67,11 +67,35 @@ struct MXTensor { MXDType dtype; }; +/*! + * \brief resource malloc function to allocate memory inside fcompute function + */ +typedef void* (*xpu_malloc_t)(void*,int); + +/*! + * \brief Class to provide resource APIs to FCompute + */ +class OpResource { + public: + OpResource(xpu_malloc_t xm, void* _xm) : xpu_malloc(xm), _xpu_malloc(_xm) {} + + /*! + * \brief allocate memory controlled by MXNet + */ + void* alloc(int size) { + return xpu_malloc(_xpu_malloc,size); + } + private: + xpu_malloc_t xpu_malloc; + void* _xpu_malloc; +}; + /*! * Custom Operator function templates */ typedef int (*fcomp_t)(std::map, - std::vector, std::vector); + std::vector, std::vector, + OpResource res); typedef int (*parseAttrs_t)(std::map, int*, int*); typedef int (*inferType_t)(std::map, @@ -214,7 +238,8 @@ typedef int (*opCallInferType_t)(inferType_t, const char* const*, const char* co #define MXLIB_OPCALLFCOMP_STR "_opCallFCompute" typedef int (*opCallFComp_t)(fcomp_t, const char* const*, const char* const*, int, const int64_t**, int*, void**, int*, int, - const int64_t**, int*, void**, int*, int); + const int64_t**, int*, void**, int*, int, + xpu_malloc_t, void*); #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int); @@ -247,7 +272,7 @@ extern "C" { void _opCallFree(void* ptr) { free(ptr); } - + /*! * \brief returns status of calling parse attributes function for operator from library */ @@ -348,7 +373,8 @@ extern "C" { const int64_t** inshapes, int* indims, void** indata, int* intypes, int num_in, const int64_t** outshapes, int* outdims, - void** outdata, int* outtypes, int num_out) { + void** outdata, int* outtypes, int num_out, + xpu_malloc_t xpu_malloc, void* _xpu_malloc) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -375,7 +401,9 @@ extern "C" { } } - return fcomp(attrs, inputs, outputs); + OpResource res(xpu_malloc,_xpu_malloc); + + return fcomp(attrs, inputs, outputs, res); } /*! 
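 * \brief Checks if the MXNet version is supported by the library.
 * If supported, initializes the library.
 */

Nothing in this patch exercises the new OpResource allocator yet. As a minimal
sketch of how a library author could request temp space from inside an fcompute
callback (the function name and the doubling logic are illustrative only, and
the int return value matches the convention at this point in the series):

    #include "lib_api.h"

    int scratchFCompute(std::map<std::string, std::string> attrs,
                        std::vector<MXTensor> inputs,
                        std::vector<MXTensor> outputs,
                        OpResource res) {
      // total number of elements in the first input tensor
      size_t len = 1;
      for (int64_t d : inputs[0].shape) len *= d;
      // the scratch buffer comes from MXNet's temp-space resource, so the
      // library never calls malloc/free for it
      float* tmp = static_cast<float*>(res.alloc(static_cast<int>(len * sizeof(float))));
      float* in = inputs[0].getData<float>();
      float* out = outputs[0].getData<float>();
      for (size_t i = 0; i < len; i++) tmp[i] = 2 * in[i];  // stage in scratch
      for (size_t i = 0; i < len; i++) out[i] = tmp[i];     // copy to output
      return 1;  // no error
    }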
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index d5b69b9ff611..682695160d90 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -351,12 +351,32 @@ int MXLoadLib(const char *path) { out_types.push_back(outputs[i].type_flag_); } + // get memory resource + const Resource &resource = ctx.requested[0]; + mshadow::Stream *cpu_stream = ctx.get_stream(); + + // create lambda that captures stream & resource objects + auto cpu_alloc = [&](int size) { + mshadow::Tensor data = resource.get_space_typed(mshadow::Shape1(size),cpu_stream); + return data.dptr_; + }; + + // create lambda without captures so that we can cast it to function pointer + // this needs to be a lambda function so that we can do the decltype cast + auto cpu_malloc = [](void* _cpu_alloc, int size) { + // cast the void* argument to the type for the cpu_alloc lambda function + decltype(cpu_alloc)* cpualloc = static_cast(_cpu_alloc); + + void* ptr = (*cpualloc)(size); + return ptr; + }; + // call fcompute function CHECK(callFComp(fcomp, attr_keys.data(), attr_vals.data(), attr_keys.size(), in_shapes.data(), in_dims.data(), in_data.data(), in_types.data(), in_data.size(), out_shapes.data(), out_dims.data(), out_data.data(), - out_types.data(), out_data.size())) + out_types.data(), out_data.size(), cpu_malloc, &cpu_alloc)) << "Error calling FCompute for custom operator '" << name_str << "'"; // return type void @@ -370,7 +390,10 @@ int MXLoadLib(const char *path) { regOp.set_attr_parser(attr_parser); regOp.set_num_inputs(num_inputs); regOp.set_num_outputs(num_outputs); - + regOp.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }); regOp.add_argument("data", "NDArray[]", "Source inputs"); regOp.set_attr("FInferType", infer_type); regOp.set_attr("FInferShape", infer_shape); From a4690b4135623a67ea2ef473b157450e533f262c Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 27 Aug 2019 20:13:14 +0000 Subject: [PATCH 049/111] fixed formatting --- include/mxnet/lib_api.h | 12 ++++++------ src/c_api/c_api.cc | 8 +++++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index b05f3ab936d1..a0ddd56f03fd 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -70,7 +70,7 @@ struct MXTensor { /*! * \brief resource malloc function to allocate memory inside fcompute function */ -typedef void* (*xpu_malloc_t)(void*,int); +typedef void* (*xpu_malloc_t)(void*, int); /*! * \brief Class to provide resource APIs to FCompute @@ -83,7 +83,7 @@ class OpResource { * \brief allocate memory controlled by MXNet */ void* alloc(int size) { - return xpu_malloc(_xpu_malloc,size); + return xpu_malloc(_xpu_malloc, size); } private: xpu_malloc_t xpu_malloc; @@ -272,7 +272,7 @@ extern "C" { void _opCallFree(void* ptr) { free(ptr); } - + /*! 
* \brief returns status of calling parse attributes function for operator from library */ @@ -338,7 +338,7 @@ extern "C" { int _opCallInferType(inferType_t inferType, const char* const* keys, const char* const* vals, int num, int* intypes, int num_in, int* outtypes, int num_out) { - //create map of attributes from list + // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { attrs[std::string(keys[i])] = std::string(vals[i]); @@ -401,8 +401,8 @@ extern "C" { } } - OpResource res(xpu_malloc,_xpu_malloc); - + OpResource res(xpu_malloc, _xpu_malloc); + return fcomp(attrs, inputs, outputs, res); } diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 682695160d90..aa1712dc7424 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -357,7 +357,8 @@ int MXLoadLib(const char *path) { // create lambda that captures stream & resource objects auto cpu_alloc = [&](int size) { - mshadow::Tensor data = resource.get_space_typed(mshadow::Shape1(size),cpu_stream); + mshadow::Tensor data = + resource.get_space_typed(mshadow::Shape1(size), cpu_stream); return data.dptr_; }; @@ -366,7 +367,7 @@ int MXLoadLib(const char *path) { auto cpu_malloc = [](void* _cpu_alloc, int size) { // cast the void* argument to the type for the cpu_alloc lambda function decltype(cpu_alloc)* cpualloc = static_cast(_cpu_alloc); - + void* ptr = (*cpualloc)(size); return ptr; }; @@ -392,7 +393,8 @@ int MXLoadLib(const char *path) { regOp.set_num_outputs(num_outputs); regOp.set_attr("FResourceRequest", [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; + return std::vector{ + ResourceRequest::kTempSpace}; }); regOp.add_argument("data", "NDArray[]", "Source inputs"); regOp.set_attr("FInferType", infer_type); From c901828d8264afe52aad2b0db140cf229960b94e Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 29 Aug 2019 05:56:35 +0000 Subject: [PATCH 050/111] changed decltype to typedef --- src/c_api/c_api.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index aa1712dc7424..bf30c4e425b5 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -362,11 +362,13 @@ int MXLoadLib(const char *path) { return data.dptr_; }; + typedef decltype(cpu_alloc) alloc_type; + // create lambda without captures so that we can cast it to function pointer // this needs to be a lambda function so that we can do the decltype cast auto cpu_malloc = [](void* _cpu_alloc, int size) { // cast the void* argument to the type for the cpu_alloc lambda function - decltype(cpu_alloc)* cpualloc = static_cast(_cpu_alloc); + alloc_type* cpualloc = static_cast(_cpu_alloc); void* ptr = (*cpualloc)(size); return ptr; From 5ddb91983e32507c31833ca3051c59643969a5bd Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 29 Aug 2019 06:01:20 +0000 Subject: [PATCH 051/111] fixed whitespace --- src/c_api/c_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index bf30c4e425b5..86d72c4fc881 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -363,7 +363,7 @@ int MXLoadLib(const char *path) { }; typedef decltype(cpu_alloc) alloc_type; - + // create lambda without captures so that we can cast it to function pointer // this needs to be a lambda function so that we can do the decltype cast auto cpu_malloc = [](void* _cpu_alloc, int size) { From 7b4c4e6508296207ffb3f5cd12f61c52c9d5af1b Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 29 Aug 2019 07:39:17 +0000 Subject: [PATCH 
052/111] Added windows declaration types, change APIs to return MXReturnValue instead of int --- example/lib_ops/mylib.cc | 36 +++++++++---------- include/mxnet/lib_api.h | 77 ++++++++++++++++++++++++++++++---------- 2 files changed, 77 insertions(+), 36 deletions(-) diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index fea5b84e30a6..ac2bd974a94d 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -43,14 +43,14 @@ void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { } -int myFCompute(std::map attrs, +MXReturnValue myFCompute(std::map attrs, std::vector inputs, std::vector outputs, OpResource res) { //validate inputs for(int i=0; i attrs, gemm(input1, input2, output, n, k, m); - return 1; //no error + return MX_SUCCESS; } -int parseAttrs(std::map attrs, +MXReturnValue parseAttrs(std::map attrs, int* num_in, int* num_out) { /* if(attrs.find("myParam") == attrs.end()) { @@ -79,20 +79,20 @@ int parseAttrs(std::map attrs, *num_in = 2; *num_out = 1; - return 1; //no error + return MX_SUCCESS; } -int inferType(std::map attrs, std::vector &intypes, +MXReturnValue inferType(std::map attrs, std::vector &intypes, std::vector &outtypes) { // validate inputs if (intypes.size() != 2) { std::cout << "Expected 2 inputs to inferType" << std::endl; - return 0; + return MX_FAIL; } if (intypes[0] != intypes[1]) { std::cout << "Expected 2 inputs to have same data type for inferType" << std::endl; - return 0; + return MX_FAIL; } outtypes[0] = intypes[0]; @@ -100,25 +100,25 @@ int inferType(std::map attrs, std::vector &intypes std::cout << "intypes[0]=" << intypes[0] << " outtypes[0]=" << outtypes[0] << std::endl; std::cout << "intypes=" << intypes.size() << " outtypes=" << outtypes.size() << std::endl; - return 1; //no error + return MX_SUCCESS; } -int inferShape(std::map attrs, std::vector> &inshapes, +MXReturnValue inferShape(std::map attrs, std::vector> &inshapes, std::vector> &outshapes) { // validate inputs if (inshapes.size() != 2) { std::cout << "Expected 2 inputs to inferShape" << std::endl; - return 0; + return MX_FAIL; } if (inshapes[0].size() != 2) { std::cout << "Expected 2D for first input to inferShape" << std::endl; - return 0; + return MX_FAIL; } if (inshapes[1].size() != 2) { std::cout << "Expected 2D for second input to inferShape" << std::endl; - return 0; + return MX_FAIL; } unsigned n = inshapes[0][0]; @@ -130,12 +130,12 @@ int inferShape(std::map attrs, std::vector, - std::vector, std::vector, - OpResource res); -typedef int (*parseAttrs_t)(std::map, - int*, int*); -typedef int (*inferType_t)(std::map, - std::vector&, std::vector&); -typedef int (*inferShape_t)(std::map, - std::vector>&, - std::vector>&); +typedef MXReturnValue (*fcomp_t)(std::map, + std::vector, std::vector, + OpResource res); +typedef MXReturnValue (*parseAttrs_t)(std::map, + int*, int*); +typedef MXReturnValue (*inferType_t)(std::map, + std::vector&, std::vector&); +typedef MXReturnValue (*inferShape_t)(std::map, + std::vector>&, + std::vector>&); /*! * \brief Class to hold custom operator registration @@ -248,14 +253,24 @@ extern "C" { /*! * \brief returns number of ops registered in this library */ - int _opRegSize() { +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl _opRegSize +#else + int _opRegSize +#endif + () { return Registry::get()->size(); } /*! 
* \brief returns operator registration at specified index */ - void _opRegGet(int idx, const char** name, fcomp_t* fcomp, +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) void __cdecl _opRegGet +#else + void _opRegGet +#endif + (int idx, const char** name, fcomp_t* fcomp, parseAttrs_t* parse, inferType_t* type, inferShape_t* shape) { CustomOp op = Registry::get()->get(idx); @@ -269,14 +284,24 @@ extern "C" { /*! * \brief calls free from the external library for library allocated arrays */ - void _opCallFree(void* ptr) { + #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) void __cdecl _opCallFree +#else + void _opCallFree +#endif + (void* ptr) { free(ptr); } /*! * \brief returns status of calling parse attributes function for operator from library */ - int _opCallParseAttrs(parseAttrs_t parseAttrs, const char* const* keys, +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl _opCallParseAttrs +#else + int _opCallParseAttrs +#endif + (parseAttrs_t parseAttrs, const char* const* keys, const char* const* vals, int num, int* num_in, int* num_out) { // create map of attributes from list @@ -291,7 +316,12 @@ extern "C" { /*! * \brief returns status of calling infer shape function for operator from library */ - int _opCallInferShape(inferShape_t inferShape, const char* const* keys, +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl _opCallInferShape +#else + int _opCallInferShape +#endif + (inferShape_t inferShape, const char* const* keys, const char* const* vals, int num, unsigned int** inshapes, int* indims, int num_in, unsigned int*** outshapes, int** outdims, int num_out) { @@ -335,7 +365,12 @@ extern "C" { /*! * \brief returns status of calling InferType function for operator from library */ - int _opCallInferType(inferType_t inferType, const char* const* keys, +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl _opCallInferType +#else + int _opCallInferType +#endif + (inferType_t inferType, const char* const* keys, const char* const* vals, int num, int* intypes, int num_in, int* outtypes, int num_out) { // create map of attributes from list @@ -368,7 +403,13 @@ extern "C" { /*! 
* \brief returns status of calling FCompute function for operator from library */ - int _opCallFCompute(fcomp_t fcomp, const char* const* keys, + +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl _opCallFCompute +#else + int _opCallFCompute +#endif + (fcomp_t fcomp, const char* const* keys, const char* const* vals, int num, const int64_t** inshapes, int* indims, void** indata, int* intypes, int num_in, @@ -416,7 +457,7 @@ extern "C" { #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) int __cdecl initialize(int); #else - int initialize(int); + MXReturnValue initialize(int); #endif } #endif // MXNET_LIB_API_H_ From 18117ecf1f281c6e6e5313b2856bc8d4c4fdf751 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 29 Aug 2019 07:59:52 +0000 Subject: [PATCH 053/111] added library version number, API to get, and check to validate --- include/mxnet/lib_api.h | 94 ++++++++++++++++++++++++----------------- src/c_api/c_api.cc | 7 +++ 2 files changed, 63 insertions(+), 38 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index c8fa635fa218..4801282ee13b 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -33,6 +33,8 @@ #include #include +#define MX_LIBRARY_VERSION 1 + /*! * \brief External Tensor data types */ @@ -249,16 +251,31 @@ typedef int (*opCallFComp_t)(fcomp_t, const char* const*, const char* const*, in #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int); +#define MXLIB_OPVERSION_STR "_opVersion" +typedef int (*opVersion_t)(); + extern "C" { + /*! + * \brief returns MXNet library version + */ + #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl +#else + int +#endif + _opVersion () { + return MX_LIBRARY_VERSION; + } + /*! 
* \brief returns number of ops registered in this library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) int __cdecl _opRegSize + __declspec(dllexport) int __cdecl #else - int _opRegSize + int #endif - () { + _opRegSize () { return Registry::get()->size(); } @@ -266,13 +283,13 @@ extern "C" { * \brief returns operator registration at specified index */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) void __cdecl _opRegGet + __declspec(dllexport) void __cdecl #else - void _opRegGet + void #endif - (int idx, const char** name, fcomp_t* fcomp, - parseAttrs_t* parse, inferType_t* type, - inferShape_t* shape) { + _opRegGet (int idx, const char** name, fcomp_t* fcomp, + parseAttrs_t* parse, inferType_t* type, + inferShape_t* shape) { CustomOp op = Registry::get()->get(idx); *name = op.name; *fcomp = op.fcompute; @@ -285,11 +302,11 @@ extern "C" { * \brief calls free from the external library for library allocated arrays */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) void __cdecl _opCallFree + __declspec(dllexport) void __cdecl #else - void _opCallFree + void #endif - (void* ptr) { + _opCallFree (void* ptr) { free(ptr); } @@ -297,13 +314,13 @@ extern "C" { * \brief returns status of calling parse attributes function for operator from library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) int __cdecl _opCallParseAttrs + __declspec(dllexport) int __cdecl #else - int _opCallParseAttrs + int #endif - (parseAttrs_t parseAttrs, const char* const* keys, - const char* const* vals, int num, - int* num_in, int* num_out) { + _opCallParseAttrs (parseAttrs_t parseAttrs, const char* const* keys, + const char* const* vals, int num, + int* num_in, int* num_out) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -317,14 +334,14 @@ extern "C" { * \brief returns status of calling infer shape function for operator from library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) int __cdecl _opCallInferShape + __declspec(dllexport) int __cdecl #else - int _opCallInferShape + int #endif - (inferShape_t inferShape, const char* const* keys, - const char* const* vals, int num, - unsigned int** inshapes, int* indims, int num_in, - unsigned int*** outshapes, int** outdims, int num_out) { + _opCallInferShape (inferShape_t inferShape, const char* const* keys, + const char* const* vals, int num, + unsigned int** inshapes, int* indims, int num_in, + unsigned int*** outshapes, int** outdims, int num_out) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -366,13 +383,13 @@ extern "C" { * \brief returns status of calling InferType function for operator from library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) int __cdecl _opCallInferType + __declspec(dllexport) int __cdecl #else - int _opCallInferType + int #endif - (inferType_t inferType, const char* const* keys, - const char* const* vals, int num, - int* intypes, int num_in, int* outtypes, int num_out) { + _opCallInferType (inferType_t inferType, const char* const* keys, + const char* const* vals, int num, + int* intypes, int num_in, int* outtypes, int num_out) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -405,17 +422,17 @@ extern "C" { */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - 
__declspec(dllexport) int __cdecl _opCallFCompute + __declspec(dllexport) int __cdecl #else - int _opCallFCompute + int #endif - (fcomp_t fcomp, const char* const* keys, - const char* const* vals, int num, - const int64_t** inshapes, int* indims, - void** indata, int* intypes, int num_in, - const int64_t** outshapes, int* outdims, - void** outdata, int* outtypes, int num_out, - xpu_malloc_t xpu_malloc, void* _xpu_malloc) { + _opCallFCompute (fcomp_t fcomp, const char* const* keys, + const char* const* vals, int num, + const int64_t** inshapes, int* indims, + void** indata, int* intypes, int num_in, + const int64_t** outshapes, int* outdims, + void** outdata, int* outtypes, int num_out, + xpu_malloc_t xpu_malloc, void* _xpu_malloc) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -455,9 +472,10 @@ extern "C" { * \return Non-zero value on error i.e. library incompatible with passed MXNet version */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) int __cdecl initialize(int); + __declspec(dllexport) MXReturnValue __cdecl #else - MXReturnValue initialize(int); + MXReturnValue #endif + initialize(int version); } #endif // MXNET_LIB_API_H_ diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 86d72c4fc881..bc432b91c333 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -103,6 +103,13 @@ int MXLoadLib(const char *path) { if (!lib) LOG(FATAL) << "Unable to load library"; + // check that library and MXNet use same version of library API + opVersion_t opVersion = get_func(lib, const_cast(MXLIB_OPVERSION_STR)); + int libVersion = opVersion(); + if (MX_LIBRARY_VERSION != libVersion) + LOG(FATAL) << "Library version (" << libVersion << ") does not match MXNet version (" + << MX_LIBRARY_VERSION << ")"; + // initialize library by passing MXNet version initialize_t initialize = get_func(lib, const_cast(MXLIB_INITIALIZE_STR)); if (!initialize(static_cast(MXNET_VERSION))) From ee65419870d31dfbaa4bfdf98855a5ba7176cef2 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Thu, 29 Aug 2019 08:15:31 +0000 Subject: [PATCH 054/111] Changed CMakeLists to build lib_ops instead of lib_api, updated lib_api example, fixed whitespace --- CMakeLists.txt | 2 +- example/lib_api/Makefile | 2 +- example/lib_api/mylib.cc | 6 ++--- include/mxnet/lib_api.h | 48 ++++++++++++++++++++-------------------- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 33529747f44c..bcdd64b6eddd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -695,7 +695,7 @@ else() endif() -add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/lib_api/mylib.cc) +add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/lib_ops/mylib.cc) target_include_directories(sample_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) set(MXNET_INSTALL_TARGETS mxnet) if(UNIX) diff --git a/example/lib_api/Makefile b/example/lib_api/Makefile index e5893c8065c4..a811f2250b3e 100644 --- a/example/lib_api/Makefile +++ b/example/lib_api/Makefile @@ -16,7 +16,7 @@ # under the License. 
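# NOTE: lib_api.h relies on C++11 features (e.g. brace initialization in
# MXTensor), so the example library must now be built with an explicit
# -std=c++11 flag, as the change below makes.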
all: - g++ -shared -fPIC mylib.cc -o mylib.so -I ../../include/mxnet + g++ -std=c++11 -shared -fPIC mylib.cc -o mylib.so -I ../../include/mxnet test: g++ -std=c++11 -O3 -o libtest libtest.cc -ldl -I ../../include/mxnet diff --git a/example/lib_api/mylib.cc b/example/lib_api/mylib.cc index e67560a87f3d..048642332f16 100644 --- a/example/lib_api/mylib.cc +++ b/example/lib_api/mylib.cc @@ -26,12 +26,12 @@ #include #include "lib_api.h" -int initialize(int version) { +MXReturnValue initialize(int version) { if (version >= 10400) { std::cout << "MXNet version " << version << " supported" << std::endl; - return 1; + return MX_SUCCESS; } else { std::cout << "MXNet version " << version << " not supported" << std::endl; - return 0; + return MX_FAIL; } } diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 4801282ee13b..f0256cc616c8 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -263,7 +263,7 @@ extern "C" { #else int #endif - _opVersion () { + _opVersion() { return MX_LIBRARY_VERSION; } @@ -275,7 +275,7 @@ extern "C" { #else int #endif - _opRegSize () { + _opRegSize() { return Registry::get()->size(); } @@ -287,9 +287,9 @@ extern "C" { #else void #endif - _opRegGet (int idx, const char** name, fcomp_t* fcomp, - parseAttrs_t* parse, inferType_t* type, - inferShape_t* shape) { + _opRegGet(int idx, const char** name, fcomp_t* fcomp, + parseAttrs_t* parse, inferType_t* type, + inferShape_t* shape) { CustomOp op = Registry::get()->get(idx); *name = op.name; *fcomp = op.fcompute; @@ -306,7 +306,7 @@ extern "C" { #else void #endif - _opCallFree (void* ptr) { + _opCallFree(void* ptr) { free(ptr); } @@ -318,9 +318,9 @@ extern "C" { #else int #endif - _opCallParseAttrs (parseAttrs_t parseAttrs, const char* const* keys, - const char* const* vals, int num, - int* num_in, int* num_out) { + _opCallParseAttrs(parseAttrs_t parseAttrs, const char* const* keys, + const char* const* vals, int num, + int* num_in, int* num_out) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -338,10 +338,10 @@ extern "C" { #else int #endif - _opCallInferShape (inferShape_t inferShape, const char* const* keys, - const char* const* vals, int num, - unsigned int** inshapes, int* indims, int num_in, - unsigned int*** outshapes, int** outdims, int num_out) { + _opCallInferShape(inferShape_t inferShape, const char* const* keys, + const char* const* vals, int num, + unsigned int** inshapes, int* indims, int num_in, + unsigned int*** outshapes, int** outdims, int num_out) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -387,9 +387,9 @@ extern "C" { #else int #endif - _opCallInferType (inferType_t inferType, const char* const* keys, - const char* const* vals, int num, - int* intypes, int num_in, int* outtypes, int num_out) { + _opCallInferType(inferType_t inferType, const char* const* keys, + const char* const* vals, int num, + int* intypes, int num_in, int* outtypes, int num_out) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -403,7 +403,7 @@ extern "C" { } // create a vector of types for outputs - std::vector out_types(num_out); + std::vector out_types(num_out, -1); int retval = inferType(attrs, in_types, out_types); if (!retval) @@ -426,13 +426,13 @@ extern "C" { #else int #endif - _opCallFCompute (fcomp_t fcomp, const char* const* keys, - const char* const* vals, int num, - const int64_t** inshapes, int* indims, - void** indata, int* intypes, int num_in, - const int64_t** 
outshapes, int* outdims, - void** outdata, int* outtypes, int num_out, - xpu_malloc_t xpu_malloc, void* _xpu_malloc) { + _opCallFCompute(fcomp_t fcomp, const char* const* keys, + const char* const* vals, int num, + const int64_t** inshapes, int* indims, + void** indata, int* intypes, int num_in, + const int64_t** outshapes, int* outdims, + void** outdata, int* outtypes, int num_out, + xpu_malloc_t xpu_malloc, void* _xpu_malloc) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { From c66438c0b184cdd67dfa1e1c9b9a220b0e3550ce Mon Sep 17 00:00:00 2001 From: rondogency Date: Thu, 29 Aug 2019 22:26:35 +0000 Subject: [PATCH 055/111] add prototype of subgraph op --- example/lib_ops/Makefile | 7 +++- example/lib_ops/mylib.cc | 8 +++- example/lib_ops/subgraph_lib.cc | 72 ++++++++++++++++++++++++++++++++ example/lib_ops/test.py | 11 ++++- example/lib_ops/test_subgraph.py | 50 ++++++++++++++++++++++ 5 files changed, 145 insertions(+), 3 deletions(-) create mode 100644 example/lib_ops/subgraph_lib.cc create mode 100644 example/lib_ops/test_subgraph.py diff --git a/example/lib_ops/Makefile b/example/lib_ops/Makefile index f649a68eee9a..628f09aadc4c 100644 --- a/example/lib_ops/Makefile +++ b/example/lib_ops/Makefile @@ -15,9 +15,14 @@ # specific language governing permissions and limitations # under the License. -all: +all: warpctc_lib subgraph_lib + +warpctc_lib: g++ -shared -fPIC -std=gnu++0x mylib.cc -o mylib.so -I ../../include/mxnet +subgraph_lib: + g++ -shared -fPIC -std=gnu++0x subgraph_lib.cc -o subgraph_lib.so -I ../../include/mxnet + test: g++ -std=c++11 -O3 -o libtest libtest.cc -ldl -I ../../include/mxnet diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index ac2bd974a94d..685ca4ddb2d4 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -138,7 +138,13 @@ MXReturnValue inferShape(std::map attrs, std::vector +#include "lib_api.h" + +MXReturnValue parseAttrs(std::map attrs, + int* num_in, int* num_out) { + *num_in = 2; + *num_out = 1; + + return MX_SUCCESS; +} + +MXReturnValue inferType(std::map attrs, std::vector &intypes, + std::vector &outtypes) { + outtypes[0] = intypes[0]; + return MX_SUCCESS; +} + +MXReturnValue inferShape(std::map attrs, std::vector> &inshapes, + std::vector> &outshapes) { + outshapes[0] = inshapes[0]; + return MX_SUCCESS; +} + +MXReturnValue myFCompute(std::map attrs, + std::vector inputs, std::vector outputs, + OpResource res) { + outputs[0] = inputs[0]; + return MX_SUCCESS; +} + +REGISTER_OP(subgraph_op) +.setFCompute(myFCompute) +.setParseAttrs(parseAttrs) +.setInferType(inferType) +.setInferShape(inferShape); + +MXReturnValue initialize(int version) { + if (version >= 10400) { + std::cout << "MXNet version " << version << " supported" << std::endl; + return MX_SUCCESS; + } else { + std::cout << "MXNet version " << version << " not supported" << std::endl; + return MX_FAIL; + } +} + diff --git a/example/lib_ops/test.py b/example/lib_ops/test.py index b9d5aeb49340..d1027faa71bb 100644 --- a/example/lib_ops/test.py +++ b/example/lib_ops/test.py @@ -41,6 +41,15 @@ #print inputs print(a) print(b) +print('--------------') #compute and print output -print(mx.nd.sam(a,b)) +print(mx.nd.gemm(a,b)) + +# symbol api +s = mx.sym.Variable('s') +t = mx.sym.Variable('t') +c = mx.sym.warpctc(s,t) +exe = c.bind(ctx=mx.cpu(),args={'s':a,'t':b}) +out = exe.forward() +print(out) diff --git a/example/lib_ops/test_subgraph.py b/example/lib_ops/test_subgraph.py new file mode 100644 index 000000000000..2791ebee2e9f --- 
/dev/null +++ b/example/lib_ops/test_subgraph.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=arguments-differ + +# This test checks if dynamic loading of library into MXNet is successful +# and checks the end-to-end computation of custom operator + +import mxnet as mx +import os + +# load library +if (os.name=='posix'): + path = os.path.abspath('subgraph_lib.so') + mx.library.load(path) +elif (os.name=='nt'): + path = os.path.abspath('subgraph_lib.dll') + mx.library.load(path) + +# setup inputs to call test operator +a = mx.nd.array([[1,2],[3,4]]) +b = mx.nd.array([[5,6],[7,8]]) + +# imperative compute and print output +print(mx.nd.subgraph_op(a,b)) + +# symbolic compute +s = mx.sym.Variable('s') +t = mx.sym.Variable('t') +c = mx.sym.subgraph_op(s,t) +exe = c.bind(ctx=mx.cpu(),args={'s':a,'t':b}) +out = exe.forward() +print(out) From 698a0b6f4dae8749df56762e3b482949311be3d3 Mon Sep 17 00:00:00 2001 From: rondogency Date: Fri, 30 Aug 2019 21:48:28 +0000 Subject: [PATCH 056/111] implement FMutateInput as optional attribute --- example/lib_ops/subgraph_lib.cc | 49 ++++++++++++++++++++-- example/lib_ops/test_subgraph.py | 2 +- include/mxnet/lib_api.h | 71 +++++++++++++++++++++++++++----- src/c_api/c_api.cc | 37 ++++++++++++++++- 4 files changed, 142 insertions(+), 17 deletions(-) diff --git a/example/lib_ops/subgraph_lib.cc b/example/lib_ops/subgraph_lib.cc index e836659bf8ce..ddea2aefb654 100644 --- a/example/lib_ops/subgraph_lib.cc +++ b/example/lib_ops/subgraph_lib.cc @@ -27,11 +27,22 @@ #include <iostream> #include "lib_api.h" +void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { + unsigned i,j,kk; + for (i=0;i<n;i++) { + for (j=0;j<m;j++) { + C[i*m+j] = 0; + for (kk=0;kk<k;kk++) { + C[i*m+j] += A[i*k+kk] * B[kk*m+j]; + } + } + } +} + MXReturnValue parseAttrs(std::map<std::string, std::string> attrs, int* num_in, int* num_out) { *num_in = 2; *num_out = 1; - return MX_SUCCESS; } @@ -43,14 +54,43 @@ MXReturnValue inferType(std::map<std::string, std::string> attrs, std::vector<int> &intypes, std::vector<int> &outtypes) { outtypes[0] = intypes[0]; return MX_SUCCESS; } MXReturnValue inferShape(std::map<std::string, std::string> attrs, std::vector<std::vector<unsigned>> &inshapes, std::vector<std::vector<unsigned>> &outshapes) { - outshapes[0] = inshapes[0]; + unsigned n = inshapes[0][0]; + unsigned k = inshapes[0][1]; + unsigned kk = inshapes[1][0]; + unsigned m = inshapes[1][1]; + + std::cout << "inshapes[0][0]=" << n << " inshapes[0][1]=" << k << std::endl; + std::cout << "inshapes[1][0]=" << kk << " inshapes[1][1]=" << m << std::endl; + + if (k != kk) + return MX_FAIL; + + outshapes[0].push_back(n); + outshapes[0].push_back(m); return MX_SUCCESS; } MXReturnValue myFCompute(std::map<std::string, std::string> attrs, std::vector<MXTensor> inputs, std::vector<MXTensor> outputs, OpResource res) { - outputs[0] = inputs[0]; + //extract data pointers from tensors + float* input1 = inputs[0].getData<float>(); + float* input2 = inputs[1].getData<float>(); + float* output = outputs[0].getData<float>(); + //set tensor shapes + unsigned n = inputs[0].shape[0]; + unsigned k = inputs[0].shape[1]; + unsigned m = inputs[1].shape[1]; + +
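// A worked example of the dimension convention this kernel assumes, with
// row-major storage (a sketch, not from the patch itself): A is n x k, B is
// k x m, so C = A*B is n x m. For the 2x2 inputs in test_subgraph.py,
//   A = [[1,2],[3,4]], B = [[5,6],[7,8]]  =>  C = [[19,22],[43,50]]
// e.g. C[0][0] = 1*5 + 2*7 = 19.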
gemm(input1, input2, output, n, k, m); + + return MX_SUCCESS; +} + +MXReturnValue mutateInputs(std::map attrs, + std::vector &input_indices) { + input_indices.push_back(1); + std::cout << "the 1st input is marked as mutate input by library author" << std::endl; return MX_SUCCESS; } @@ -58,7 +98,8 @@ REGISTER_OP(subgraph_op) .setFCompute(myFCompute) .setParseAttrs(parseAttrs) .setInferType(inferType) -.setInferShape(inferShape); +.setInferShape(inferShape) +.setMutateInputs(mutateInputs); MXReturnValue initialize(int version) { if (version >= 10400) { diff --git a/example/lib_ops/test_subgraph.py b/example/lib_ops/test_subgraph.py index 2791ebee2e9f..279c6af6ca12 100644 --- a/example/lib_ops/test_subgraph.py +++ b/example/lib_ops/test_subgraph.py @@ -45,6 +45,6 @@ s = mx.sym.Variable('s') t = mx.sym.Variable('t') c = mx.sym.subgraph_op(s,t) -exe = c.bind(ctx=mx.cpu(),args={'s':a,'t':b}) +exe = c.bind(ctx=mx.cpu(),args={'s':a},aux_states={'t':b}) out = exe.forward() print(out) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index f0256cc616c8..bf6b7ae8d156 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -48,11 +48,17 @@ enum MXDType { kInt64 = 6, }; +enum MXReturnValue { + MX_FAIL = 0, + MX_SUCCESS = 1, +}; + /*! * \brief External Tensor data structure */ struct MXTensor { - MXTensor() { data = nullptr; } + MXTensor() : data(nullptr) {} + MXTensor(void *data, const std::vector &shape, MXDType dtype) : data{data}, shape{shape}, dtype{dtype} {} @@ -92,11 +98,6 @@ class OpResource { void* _xpu_malloc; }; -enum MXReturnValue { - MX_FAIL = 0, - MX_SUCCESS = 1, -}; - /*! * Custom Operator function templates */ @@ -110,6 +111,8 @@ typedef MXReturnValue (*inferType_t)(std::map, typedef MXReturnValue (*inferShape_t)(std::map, std::vector>&, std::vector>&); +typedef MXReturnValue (*mutateInputs_t)(std::map, + std::vector&); /*! * \brief Class to hold custom operator registration @@ -117,7 +120,8 @@ typedef MXReturnValue (*inferShape_t)(std::map, class CustomOp { public: explicit CustomOp(const char* op_name) : name(op_name), fcompute(nullptr), - parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr) {} + parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr), + mutate_inputs(nullptr) {} ~CustomOp() {} CustomOp& setFCompute(fcomp_t fcomp) { fcompute = fcomp; @@ -135,6 +139,10 @@ class CustomOp { infer_shape = func; return *this; } + CustomOp& setMutateInputs(mutateInputs_t func) { + mutate_inputs = func; + return *this; + } /*! \brief operator name */ const char* name; /*! \brief operator functions */ @@ -142,6 +150,7 @@ class CustomOp { parseAttrs_t parse_attrs; inferType_t infer_type; inferShape_t infer_shape; + mutateInputs_t mutate_inputs; }; /*! @@ -210,21 +219,23 @@ class Registry { #define REGISTER_OP(Name) _STR_CONCAT(_REGISTER_DEF_(Name), __COUNTER__) = \ Registry::get()->add(TOSTRING(Name)) +/* + * -------------- BELOW FUNCTIONS ARE USED IN MXNET BACKEND --------------- + */ /*! 
- * \brief Following are the APIs implemented in the external library + * \brief Following are the C type APIs implemented in the external library * Each API has a #define string that is used to lookup the function in the library * Followed by the function declaration */ - #define MXLIB_OPREGSIZE_STR "_opRegSize" typedef int (*opRegSize_t)(void); #define MXLIB_OPREGGET_STR "_opRegGet" typedef int (*opRegGet_t)(int, const char**, fcomp_t*, parseAttrs_t*, inferType_t*, - inferShape_t*); + inferShape_t*, mutateInputs_t*); #define MXLIB_OPCALLFREE_STR "_opCallFree" typedef int (*opCallFree_t)(void*); @@ -248,6 +259,10 @@ typedef int (*opCallFComp_t)(fcomp_t, const char* const*, const char* const*, in const int64_t**, int*, void**, int*, int, xpu_malloc_t, void*); +#define MXLIB_OPCALLMUTATEINPUTS_STR "_opCallMutateInputs" +typedef int (*opCallMutateInputs_t)(mutateInputs_t, const char* const*, const char* const*, int, + int**, int*); + #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int); @@ -289,13 +304,14 @@ extern "C" { #endif _opRegGet(int idx, const char** name, fcomp_t* fcomp, parseAttrs_t* parse, inferType_t* type, - inferShape_t* shape) { + inferShape_t* shape, mutateInputs_t* mutate) { CustomOp op = Registry::get()->get(idx); *name = op.name; *fcomp = op.fcompute; *parse = op.parse_attrs; *type = op.infer_type; *shape = op.infer_shape; + *mutate = op.mutate_inputs; } /*! @@ -464,6 +480,39 @@ extern "C" { return fcomp(attrs, inputs, outputs, res); } + /*! + * \brief returns status of calling mutate inputs function for operator from library + */ +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl +#else + int +#endif + _opCallMutateInputs(mutateInputs_t mutate, const char* const* keys, + const char* const* vals, int num, + int** mutate_indices, int* indices_size) { + // create map of attributes from list + std::map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } + + // create a vector of mutate input indices + std::vector mut_ind; + + int retval = mutate(attrs, mut_ind); + if (!retval) + return retval; + + // output the input indices + *indices_size = mut_ind.size(); + *mutate_indices = static_cast(malloc (*indices_size * sizeof(int))); + for (int i = 0; i < *indices_size; i++) { + (*mutate_indices)[i] = mut_ind[i]; + } + + return retval; + } /*! * \brief Checks if the MXNet version is supported by the library. * If supported, initializes the library. 
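To see how these C entry points are meant to be driven from the host process, here is a minimal sketch in the spirit of libtest.cc (not part of the patch; it assumes the library was built from subgraph_lib.cc and passes an empty attribute list):

    #include <dlfcn.h>
    #include <iostream>
    #include "lib_api.h"

    int main(void) {
      void* handle = dlopen("subgraph_lib.so", RTLD_LAZY);
      if (!handle) return 1;

      // look up the C wrappers that lib_api.h exports from the library
      opRegGet_t regGet = (opRegGet_t) dlsym(handle, MXLIB_OPREGGET_STR);
      opCallMutateInputs_t callMutate =
          (opCallMutateInputs_t) dlsym(handle, MXLIB_OPCALLMUTATEINPUTS_STR);
      opCallFree_t callFree = (opCallFree_t) dlsym(handle, MXLIB_OPCALLFREE_STR);

      // fetch the function table of the first registered operator
      const char* name;
      fcomp_t fcomp; parseAttrs_t parse; inferType_t type;
      inferShape_t shape; mutateInputs_t mutate;
      regGet(0, &name, &fcomp, &parse, &type, &shape, &mutate);

      // ask which inputs the op mutates; num = 0 means no attributes
      int* indices = nullptr;
      int num_indices = 0;
      if (mutate != nullptr &&
          callMutate(mutate, nullptr, nullptr, 0, &indices, &num_indices)) {
        for (int i = 0; i < num_indices; i++)
          std::cout << name << " mutates input " << indices[i] << std::endl;
        callFree(indices);  // the indices were malloc'd inside the library
      }

      dlclose(handle);
      return 0;
    }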
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index bc432b91c333..52bff5913f7a 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -130,6 +130,9 @@ int MXLoadLib(const char *path) { opCallFComp_t callFComp = get_func(lib, const_cast(MXLIB_OPCALLFCOMP_STR)); + opCallMutateInputs_t callMutateInputs = + get_func(lib, const_cast(MXLIB_OPCALLMUTATEINPUTS_STR)); + // get number of operators registered in the library opRegSize_t opRegSize = get_func(lib, const_cast(MXLIB_OPREGSIZE_STR)); int numOps = opRegSize(); @@ -142,13 +145,15 @@ int MXLoadLib(const char *path) { opRegGet_t opRegGet = get_func(lib, const_cast(MXLIB_OPREGGET_STR)); for (int i = 0; i < numOps; i++) { const char* name; + // function pointers holding implementation from custom library fcomp_t fcomp = nullptr; parseAttrs_t parse = nullptr; inferType_t type = nullptr; inferShape_t shape = nullptr; + mutateInputs_t mutate = nullptr; // optional // get custom operator implemenation from the dynamic library - opRegGet(i, &name, &fcomp, &parse, &type, &shape); + opRegGet(i, &name, &fcomp, &parse, &type, &shape, &mutate); // validate custom operator functions from the dynamic library CHECK(fcomp != nullptr) << "Error loading '" << name @@ -392,6 +397,32 @@ int MXLoadLib(const char *path) { // return type void }; + // lambda function to convert from external mutate_inputs to internal MXNet types + auto mutate_inputs = [=](const nnvm::NodeAttrs& attrs) { + // convert attributes to vector of char* + std::vector attr_keys, attr_vals; + for (auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + + // C type placeholder for mutate input indices vector + int* mutate_indices = nullptr; + int indices_size = 0; + + // call mutate inputs function + CHECK(callMutateInputs(mutate, attr_keys.data(), attr_vals.data(), attr_keys.size(), + &mutate_indices, &indices_size)) + << "Error calling MutateInputs for custom operator '" << name_str << "'"; + + std::vector mutate_indices_list(indices_size); + for (int i=0; i(mutate_indices[i]); + } + + return mutate_indices_list; + }; + // check if operator is already registered const nnvm::Op *regOpPtr = dmlc::Registry::Get()->Find(name); if (regOpPtr == nullptr) { @@ -409,6 +440,8 @@ int MXLoadLib(const char *path) { regOp.set_attr("FInferType", infer_type); regOp.set_attr("FInferShape", infer_shape); regOp.set_attr("FCompute", fcomp_conv); + if (mutate != nullptr) + regOp.set_attr("FMutateInputs", mutate_inputs); } else { // overwrite registration of existing op with custom op nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); @@ -425,6 +458,8 @@ int MXLoadLib(const char *path) { regOp.set_attr("FInferType", infer_type, 11); regOp.set_attr("FInferShape", infer_shape, 11); regOp.set_attr("FCompute", fcomp_conv, 11); + if (mutate != nullptr) + regOp.set_attr("FMutateInputs", mutate_inputs, 11); } } From bd5561210879e04b9ece0db4731090344b8745b9 Mon Sep 17 00:00:00 2001 From: rondogency Date: Fri, 30 Aug 2019 22:10:02 +0000 Subject: [PATCH 057/111] fix sanity check --- src/c_api/c_api.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 52bff5913f7a..d13deca09542 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -150,7 +150,8 @@ int MXLoadLib(const char *path) { parseAttrs_t parse = nullptr; inferType_t type = nullptr; inferShape_t shape = nullptr; - mutateInputs_t mutate = nullptr; // optional + // optional attributes + mutateInputs_t mutate = nullptr; 
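// For orientation: once this optional callback is wired through, it surfaces
// in NNVM exactly like a built-in operator's FMutateInputs attribute. A
// hand-written equivalent would be roughly the following sketch (op name
// hypothetical):
//   NNVM_REGISTER_OP(some_stateful_op)
//   .set_attr<nnvm::FMutateInputs>("FMutateInputs",
//     [](const nnvm::NodeAttrs& attrs) {
//       return std::vector<uint32_t>{1};  // input index 1 is updated in place
//     });
// This is the same mechanism built-in optimizers use to mark weights as
// mutable state.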
// get custom operator implemenation from the dynamic library opRegGet(i, &name, &fcomp, &parse, &type, &shape, &mutate); @@ -416,7 +417,7 @@ int MXLoadLib(const char *path) { << "Error calling MutateInputs for custom operator '" << name_str << "'"; std::vector mutate_indices_list(indices_size); - for (int i=0; i(mutate_indices[i]); } From 35ff973f77095893961bb1191cb2e0c599f52c39 Mon Sep 17 00:00:00 2001 From: rondogency Date: Tue, 3 Sep 2019 04:07:25 +0000 Subject: [PATCH 058/111] replace fcompute to fcomputeEx and implement simple finferstoragetype --- src/c_api/c_api.cc | 79 +++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 36 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index d13deca09542..7451a4201952 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -115,7 +115,7 @@ int MXLoadLib(const char *path) { if (!initialize(static_cast(MXNET_VERSION))) LOG(FATAL) << "Library failed to initialize"; - // get call functions + // get C type interface functions opCallFree_t callFree = get_func(lib, const_cast(MXLIB_OPCALLFREE_STR)); opCallParseAttrs_t callParseAttrs = @@ -146,24 +146,24 @@ int MXLoadLib(const char *path) { for (int i = 0; i < numOps; i++) { const char* name; // function pointers holding implementation from custom library - fcomp_t fcomp = nullptr; - parseAttrs_t parse = nullptr; - inferType_t type = nullptr; - inferShape_t shape = nullptr; + fcomp_t fcomp_fp = nullptr; + parseAttrs_t parse_fp = nullptr; + inferType_t type_fp = nullptr; + inferShape_t shape_fp = nullptr; // optional attributes - mutateInputs_t mutate = nullptr; + mutateInputs_t mutate_fp = nullptr; // get custom operator implemenation from the dynamic library - opRegGet(i, &name, &fcomp, &parse, &type, &shape, &mutate); + opRegGet(i, &name, &fcomp_fp, &parse_fp, &type_fp, &shape_fp, &mutate_fp); // validate custom operator functions from the dynamic library - CHECK(fcomp != nullptr) << "Error loading '" << name + CHECK(fcomp_fp != nullptr) << "Error loading '" << name << "' custom op, FCompute function was not set."; - CHECK(parse != nullptr) << "Error loading '" << name + CHECK(parse_fp != nullptr) << "Error loading '" << name << "' custom op, ParseAttrs function was not set."; - CHECK(type != nullptr) << "Error loading '" << name + CHECK(type_fp != nullptr) << "Error loading '" << name << "' custom op, InferType function was not set."; - CHECK(shape != nullptr) << "Error loading '" << name + CHECK(shape_fp != nullptr) << "Error loading '" << name << "' custom op, InferShape function was not set."; LOG(INFO) << "\tOp[" << i << "] " << name; @@ -186,7 +186,7 @@ int MXLoadLib(const char *path) { int num_in = -1; int num_out = -1; - CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callParseAttrs(parse_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs for custom operator '" << name_str << "'"; @@ -204,7 +204,7 @@ int MXLoadLib(const char *path) { int num_in = -1; int num_out = -1; - CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callParseAttrs(parse_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs::num_inputs for custom operator '" << name_str << "'"; @@ -222,7 +222,7 @@ int MXLoadLib(const char *path) { int num_in = -1; int num_out = -1; - CHECK(callParseAttrs(parse, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callParseAttrs(parse_fp, 
attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs::num_outputs for custom operator '" << name_str << "'"; @@ -262,7 +262,7 @@ int MXLoadLib(const char *path) { uint32_t** outshapes = nullptr; int* outdims = nullptr; - CHECK(callInferShape(shape, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callInferShape(shape_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), inshapes.data(), indims.data(), in_shape->size(), &outshapes, &outdims, out_shape->size())) << "Error calling InferShape for custom operator '" << name_str << "'"; @@ -317,7 +317,7 @@ int MXLoadLib(const char *path) { // output types will be populated by inferType function std::vector outtypes(out_type->size()); - CHECK(callInferType(type, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callInferType(type_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), intypes.data(), in_type->size(), outtypes.data(), out_type->size())) << "Error calling InferType for custom operator '" << name_str << "'"; @@ -331,11 +331,11 @@ int MXLoadLib(const char *path) { }; // lambda function to convert from external fcompute to internal MXNet types - auto fcomp_conv = [=](const nnvm::NodeAttrs& attrs, + auto fcomp_lambda = [=](const nnvm::NodeAttrs& attrs, const OpContext& ctx, - const std::vector& inputs, + const std::vector& inputs, const std::vector& req, - const std::vector& outputs) { + const std::vector& outputs) { // convert attributes to vector of char* std::vector attr_keys, attr_vals; for (auto kv : attrs.dict) { @@ -350,18 +350,18 @@ int MXLoadLib(const char *path) { // convert input tensors to constituent parts for (size_t i = 0; i < inputs.size(); i++) { - in_data.push_back(inputs[i].dptr_); - in_shapes.push_back(inputs[i].shape_.data()); - in_dims.push_back(inputs[i].shape_.ndim()); - in_types.push_back(inputs[i].type_flag_); + in_data.push_back(inputs[i].data().dptr_); + in_shapes.push_back(inputs[i].shape().data()); + in_dims.push_back(inputs[i].shape().ndim()); + in_types.push_back(inputs[i].dtype()); } // convert output tensors to constituent parts for (size_t i = 0; i < outputs.size(); i++) { - out_data.push_back(outputs[i].dptr_); - out_shapes.push_back(outputs[i].shape_.data()); - out_dims.push_back(outputs[i].shape_.ndim()); - out_types.push_back(outputs[i].type_flag_); + out_data.push_back(outputs[i].data().dptr_); + out_shapes.push_back(outputs[i].shape().data()); + out_dims.push_back(outputs[i].shape().ndim()); + out_types.push_back(outputs[i].dtype()); } // get memory resource @@ -388,7 +388,7 @@ int MXLoadLib(const char *path) { }; // call fcompute function - CHECK(callFComp(fcomp, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callFComp(fcomp_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), in_shapes.data(), in_dims.data(), in_data.data(), in_types.data(), in_data.size(), out_shapes.data(), out_dims.data(), out_data.data(), @@ -412,7 +412,7 @@ int MXLoadLib(const char *path) { int indices_size = 0; // call mutate inputs function - CHECK(callMutateInputs(mutate, attr_keys.data(), attr_vals.data(), attr_keys.size(), + CHECK(callMutateInputs(mutate_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), &mutate_indices, &indices_size)) << "Error calling MutateInputs for custom operator '" << name_str << "'"; @@ -424,6 +424,15 @@ int MXLoadLib(const char *path) { return mutate_indices_list; }; + auto infer_storage_type = [=](const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + 
std::vector* in_stypes, + std::vector* out_stypes) { + return op::storage_type_assign(out_stypes, mxnet::kDefaultStorage, + dispatch_mode, DispatchMode::kFComputeEx); + }; + // check if operator is already registered const nnvm::Op *regOpPtr = dmlc::Registry::Get()->Find(name); if (regOpPtr == nullptr) { @@ -432,6 +441,7 @@ int MXLoadLib(const char *path) { regOp.set_attr_parser(attr_parser); regOp.set_num_inputs(num_inputs); regOp.set_num_outputs(num_outputs); + regOp.set_attr("FInferStorageType", infer_storage_type); regOp.set_attr("FResourceRequest", [](const NodeAttrs& attrs) { return std::vector{ @@ -440,26 +450,23 @@ int MXLoadLib(const char *path) { regOp.add_argument("data", "NDArray[]", "Source inputs"); regOp.set_attr("FInferType", infer_type); regOp.set_attr("FInferShape", infer_shape); - regOp.set_attr("FCompute", fcomp_conv); - if (mutate != nullptr) + regOp.set_attr("FComputeEx", fcomp_lambda); + if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs); } else { // overwrite registration of existing op with custom op nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); - regOp.set_attr_parser(attr_parser); regOp.set_num_inputs(num_inputs); regOp.set_num_outputs(num_outputs); - regOp.arguments.clear(); regOp.add_argument("data", "NDArray[]", "Source inputs"); - // set attribute with higher plevel (11) to allow re-registering once // TODO(samskalicky): enable constant overwriting of registertion multiple times regOp.set_attr("FInferType", infer_type, 11); regOp.set_attr("FInferShape", infer_shape, 11); - regOp.set_attr("FCompute", fcomp_conv, 11); - if (mutate != nullptr) + regOp.set_attr("FComputeEx", fcomp_lambda, 11); + if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs, 11); } } From f243e2fd632a9800ca6b563c18369cc7b1d3a0bf Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Wed, 4 Sep 2019 07:13:18 +0000 Subject: [PATCH 059/111] changed fcompute to forward --- example/lib_ops/mylib.cc | 6 +++--- example/lib_ops/subgraph_lib.cc | 4 ++-- include/mxnet/lib_api.h | 11 ++++++++--- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index 685ca4ddb2d4..5d09760d7b74 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -43,7 +43,7 @@ void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { } -MXReturnValue myFCompute(std::map attrs, +MXReturnValue forward(std::map attrs, std::vector inputs, std::vector outputs, OpResource res) { //validate inputs @@ -139,13 +139,13 @@ MXReturnValue inferShape(std::map attrs, std::vector attrs, std::vector attrs, +MXReturnValue forward(std::map attrs, std::vector inputs, std::vector outputs, OpResource res) { //extract data pointers from tensors @@ -95,7 +95,7 @@ MXReturnValue mutateInputs(std::map attrs, } REGISTER_OP(subgraph_op) -.setFCompute(myFCompute) +.setForward(forward) .setParseAttrs(parseAttrs) .setInferType(inferType) .setInferShape(inferShape) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index bf6b7ae8d156..2ba51ad28f47 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -120,13 +120,17 @@ typedef MXReturnValue (*mutateInputs_t)(std::map, class CustomOp { public: explicit CustomOp(const char* op_name) : name(op_name), fcompute(nullptr), - parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr), - mutate_inputs(nullptr) {} + fgradient(nullptr), parse_attrs(nullptr), infer_type(nullptr), + infer_shape(nullptr), mutate_inputs(nullptr) {} 
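// Because every setter below returns *this, a library can register an op in
// one fluent chain. A sketch of the intended usage (my_gemm, forward and
// backward are hypothetical library-author functions):
//   REGISTER_OP(my_gemm)
//   .setForward(forward)
//   .setGradient(backward)   // optional, introduced by this patch
//   .setParseAttrs(parseAttrs)
//   .setInferType(inferType)
//   .setInferShape(inferShape);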
~CustomOp() {} - CustomOp& setFCompute(fcomp_t fcomp) { + CustomOp& setForward(fcomp_t fcomp) { fcompute = fcomp; return *this; } + CustomOp& setGradient(fcomp_t fcomp) { + fgradient = fcomp; + return *this; + } CustomOp& setParseAttrs(parseAttrs_t func) { parse_attrs = func; return *this; @@ -147,6 +151,7 @@ class CustomOp { const char* name; /*! \brief operator functions */ fcomp_t fcompute; + fcomp_t fgradient; parseAttrs_t parse_attrs; inferType_t infer_type; inferShape_t infer_shape; From efbb858f4a51ae449ae664694872881f19d20e13 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Wed, 4 Sep 2019 07:43:06 +0000 Subject: [PATCH 060/111] initial commit with fgradient support --- include/mxnet/lib_api.h | 5 +++-- src/c_api/c_api.cc | 45 +++++++++++++++++++++++++---------------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 2ba51ad28f47..6c1166a7f147 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -238,7 +238,7 @@ class Registry { typedef int (*opRegSize_t)(void); #define MXLIB_OPREGGET_STR "_opRegGet" -typedef int (*opRegGet_t)(int, const char**, fcomp_t*, +typedef int (*opRegGet_t)(int, const char**, fcomp_t*, fcomp_t*, parseAttrs_t*, inferType_t*, inferShape_t*, mutateInputs_t*); @@ -307,12 +307,13 @@ extern "C" { #else void #endif - _opRegGet(int idx, const char** name, fcomp_t* fcomp, + _opRegGet(int idx, const char** name, fcomp_t* fcomp, fcomp_t* fgrad, parseAttrs_t* parse, inferType_t* type, inferShape_t* shape, mutateInputs_t* mutate) { CustomOp op = Registry::get()->get(idx); *name = op.name; *fcomp = op.fcompute; + *fgrad = op.fgradient; *parse = op.parse_attrs; *type = op.infer_type; *shape = op.infer_shape; diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 7451a4201952..dd5cf7686dd0 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -147,6 +147,7 @@ int MXLoadLib(const char *path) { const char* name; // function pointers holding implementation from custom library fcomp_t fcomp_fp = nullptr; + fcomp_t fgrad_fp = nullptr; parseAttrs_t parse_fp = nullptr; inferType_t type_fp = nullptr; inferShape_t shape_fp = nullptr; @@ -154,7 +155,7 @@ int MXLoadLib(const char *path) { mutateInputs_t mutate_fp = nullptr; // get custom operator implemenation from the dynamic library - opRegGet(i, &name, &fcomp_fp, &parse_fp, &type_fp, &shape_fp, &mutate_fp); + opRegGet(i, &name, &fcomp_fp, &fgrad_fp, &parse_fp, &type_fp, &shape_fp, &mutate_fp); // validate custom operator functions from the dynamic library CHECK(fcomp_fp != nullptr) << "Error loading '" << name @@ -435,32 +436,40 @@ int MXLoadLib(const char *path) { // check if operator is already registered const nnvm::Op *regOpPtr = dmlc::Registry::Get()->Find(name); + nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); + regOp.set_attr_parser(attr_parser); + regOp.set_num_inputs(num_inputs); + regOp.set_num_outputs(num_outputs); + regOp.set_attr("FInferStorageType", infer_storage_type); + regOp.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ + ResourceRequest::kTempSpace}; + }); if (regOpPtr == nullptr) { // re-register op in MXNet using lambda converter functions - nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); - regOp.set_attr_parser(attr_parser); - regOp.set_num_inputs(num_inputs); - regOp.set_num_outputs(num_outputs); - regOp.set_attr("FInferStorageType", infer_storage_type); - regOp.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ 
- ResourceRequest::kTempSpace}; - }); - regOp.add_argument("data", "NDArray[]", "Source inputs"); regOp.set_attr("FInferType", infer_type); regOp.set_attr("FInferShape", infer_shape); regOp.set_attr("FComputeEx", fcomp_lambda); if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs); + if (fgrad_fp != nullptr) { + // regOp.set_attr("FGradient"); + std::string grad_name(std::string("_backward_") + name); + nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); + gradOp.set_attr("TIsBackward", true); + // gradOp.set_attr_parser(); + // gradOp.set_num_inputs(); + // gradOp.set_num_outputs(); + // gradOp.set_attr("FInferStorageType"); + // gradOp.set_attr("FResourceRequest", [](const NodeAttrs& n) { + // return std::vector{ResourceRequest::kTempSpace}; + // }) + // gradOp.set_attr("FComputeEx"); + } } else { // overwrite registration of existing op with custom op - nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); - regOp.set_attr_parser(attr_parser); - regOp.set_num_inputs(num_inputs); - regOp.set_num_outputs(num_outputs); regOp.arguments.clear(); - regOp.add_argument("data", "NDArray[]", "Source inputs"); // set attribute with higher plevel (11) to allow re-registering once // TODO(samskalicky): enable constant overwriting of registertion multiple times regOp.set_attr("FInferType", infer_type, 11); @@ -468,7 +477,9 @@ int MXLoadLib(const char *path) { regOp.set_attr("FComputeEx", fcomp_lambda, 11); if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs, 11); + // TODO(samskalicky): add fgrad support here too } + regOp.add_argument("data", "NDArray[]", "Source inputs"); } API_END(); From 0032143dad878fdf1a94fd793b1b713b4b2898be Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Wed, 4 Sep 2019 15:58:02 +0000 Subject: [PATCH 061/111] enabled gradient registration --- src/c_api/c_api.cc | 130 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 104 insertions(+), 26 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index dd5cf7686dd0..20428835964d 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -50,6 +50,7 @@ #include "../initialize.h" #include "./c_api_common.h" #include "../operator/custom/custom-inl.h" +#include "../operator/operator_common.h" #include "../operator/tensor/matrix_op-inl.h" #include "../operator/tvmop/op_module.h" #include "../common/utils.h" @@ -230,6 +231,25 @@ int MXLoadLib(const char *path) { return num_out; }; + // lambda function to call parse attributes and return the number of inputs and outputs + // for gradient computation + auto num_inouts = [=](const NodeAttrs& attrs) { + // convert attributes to vector of char* + std::vector attr_keys, attr_vals; + for (auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + + int num_in = -1; + int num_out = -1; + CHECK(callParseAttrs(parse_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), + &num_in, &num_out)) + << "Error calling ParseAttrs::num_outputs for custom operator '" << name_str << "'"; + + return num_in + num_out; + }; + // lambda function to call infer shape auto infer_shape = [=] (const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_shape, @@ -332,11 +352,12 @@ int MXLoadLib(const char *path) { }; // lambda function to convert from external fcompute to internal MXNet types - auto fcomp_lambda = [=](const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + auto 
fcomp_lambda = [=](fcomp_t fcomp_fp, + const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { // convert attributes to vector of char* std::vector attr_keys, attr_vals; for (auto kv : attrs.dict) { @@ -399,6 +420,22 @@ int MXLoadLib(const char *path) { // return type void }; + auto forward_lambda = [=](const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + return fcomp_lambda(fcomp_fp, attrs, ctx, inputs, req, outputs); + }; + + auto gradient_lambda = [=](const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + return fcomp_lambda(fgrad_fp, attrs, ctx, inputs, req, outputs); + }; + // lambda function to convert from external mutate_inputs to internal MXNet types auto mutate_inputs = [=](const nnvm::NodeAttrs& attrs) { // convert attributes to vector of char* @@ -425,15 +462,45 @@ int MXLoadLib(const char *path) { return mutate_indices_list; }; + // lambda function to set storage types auto infer_storage_type = [=](const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_stypes, - std::vector* out_stypes) { + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_stypes, + std::vector* out_stypes) { + // set outputs as dense return op::storage_type_assign(out_stypes, mxnet::kDefaultStorage, dispatch_mode, DispatchMode::kFComputeEx); }; + /* + * GradStruct + * this struct sets that the operator will use both the inputs and the outputs to compute + * the gradient. The order is: [grads, inputs, outputs] + */ + struct GradStruct { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + // copy gradients first + std::vector heads(ograds.begin(), ograds.end()); + // copy inputs second + for (auto& h : n->inputs) { + heads.push_back(h); + } + // copy outputs last + uint32_t n_out = n->num_outputs(); + for (uint32_t i = 0; i < n_out; ++i) { + heads.emplace_back(n, i, 0); + } + return mxnet::op::MakeGradNode(op_name, n, heads, n->attrs.dict); + } + }; + + auto resc_req = [=](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }; + // check if operator is already registered const nnvm::Op *regOpPtr = dmlc::Registry::Get()->Find(name); nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); @@ -441,31 +508,28 @@ int MXLoadLib(const char *path) { regOp.set_num_inputs(num_inputs); regOp.set_num_outputs(num_outputs); regOp.set_attr("FInferStorageType", infer_storage_type); - regOp.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ - ResourceRequest::kTempSpace}; - }); + regOp.set_attr("FResourceRequest", resc_req); if (regOpPtr == nullptr) { // re-register op in MXNet using lambda converter functions regOp.set_attr("FInferType", infer_type); regOp.set_attr("FInferShape", infer_shape); - regOp.set_attr("FComputeEx", fcomp_lambda); + regOp.set_attr("FComputeEx", forward_lambda); + // optionally add fmutate inputs if user specified a function if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs); + // optionally add fgradient if user specified a function if (fgrad_fp != nullptr) { - // regOp.set_attr("FGradient"); std::string grad_name(std::string("_backward_") + name); + regOp.set_attr("FGradient",GradStruct{grad_name.c_str()}); + nnvm::Op &gradOp = 
dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); gradOp.set_attr("TIsBackward", true); - // gradOp.set_attr_parser(); - // gradOp.set_num_inputs(); - // gradOp.set_num_outputs(); - // gradOp.set_attr("FInferStorageType"); - // gradOp.set_attr("FResourceRequest", [](const NodeAttrs& n) { - // return std::vector{ResourceRequest::kTempSpace}; - // }) - // gradOp.set_attr("FComputeEx"); + gradOp.set_attr_parser(attr_parser); + gradOp.set_num_inputs(num_inouts); + gradOp.set_num_outputs(num_outputs); + gradOp.set_attr("FInferStorageType", infer_storage_type); + gradOp.set_attr("FResourceRequest", resc_req); + gradOp.set_attr("FComputeEx",gradient_lambda); } } else { // overwrite registration of existing op with custom op @@ -474,10 +538,24 @@ int MXLoadLib(const char *path) { // TODO(samskalicky): enable constant overwriting of registertion multiple times regOp.set_attr("FInferType", infer_type, 11); regOp.set_attr("FInferShape", infer_shape, 11); - regOp.set_attr("FComputeEx", fcomp_lambda, 11); + regOp.set_attr("FComputeEx", forward_lambda, 11); + // optionally add fmutate inputs if user specified a function if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs, 11); - // TODO(samskalicky): add fgrad support here too + // optionally add fgradient if user specified a function + if (fgrad_fp != nullptr) { + std::string grad_name(std::string("_backward_") + name); + regOp.set_attr("FGradient",GradStruct{grad_name.c_str()}, 11); + + nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); + gradOp.set_attr("TIsBackward", true, 11); + gradOp.set_attr_parser(attr_parser); + gradOp.set_num_inputs(num_inouts); + gradOp.set_num_outputs(num_outputs); + gradOp.set_attr("FInferStorageType", infer_storage_type, 11); + gradOp.set_attr("FResourceRequest", resc_req, 11); + gradOp.set_attr("FComputeEx",gradient_lambda, 11); + } } regOp.add_argument("data", "NDArray[]", "Source inputs"); } From 14ef3a7c6900cd256d5f175bf3b1b78018141ae5 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Wed, 4 Sep 2019 18:05:43 +0000 Subject: [PATCH 062/111] fixed whitespace --- src/c_api/c_api.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 20428835964d..1ba78e2a5c0d 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -249,7 +249,7 @@ int MXLoadLib(const char *path) { return num_in + num_out; }; - + // lambda function to call infer shape auto infer_shape = [=] (const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_shape, @@ -435,7 +435,7 @@ int MXLoadLib(const char *path) { const std::vector& outputs) { return fcomp_lambda(fgrad_fp, attrs, ctx, inputs, req, outputs); }; - + // lambda function to convert from external mutate_inputs to internal MXNet types auto mutate_inputs = [=](const nnvm::NodeAttrs& attrs) { // convert attributes to vector of char* @@ -520,7 +520,7 @@ int MXLoadLib(const char *path) { // optionally add fgradient if user specified a function if (fgrad_fp != nullptr) { std::string grad_name(std::string("_backward_") + name); - regOp.set_attr("FGradient",GradStruct{grad_name.c_str()}); + regOp.set_attr("FGradient", GradStruct{grad_name.c_str()}); nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); gradOp.set_attr("TIsBackward", true); @@ -529,7 +529,7 @@ int MXLoadLib(const char *path) { gradOp.set_num_outputs(num_outputs); gradOp.set_attr("FInferStorageType", infer_storage_type); gradOp.set_attr("FResourceRequest", resc_req); - 
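// Worked example of the [grads, inputs, outputs] ordering that GradStruct
// establishes above (indices follow from the struct; they are not spelled
// out in the patch). For a 2-input/1-output op such as subgraph_op, the
// generated _backward_subgraph_op node receives 4 tensors:
//   heads[0] = dL/dY  (output gradient)
//   heads[1] = X0     (first forward input)
//   heads[2] = X1     (second forward input)
//   heads[3] = Y      (forward output, node entry {n, 0, 0})
// so a gradient function registered via setGradient must index accordingly.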
gradOp.set_attr("FComputeEx",gradient_lambda); + gradOp.set_attr("FComputeEx", gradient_lambda); } } else { // overwrite registration of existing op with custom op @@ -545,7 +545,7 @@ int MXLoadLib(const char *path) { // optionally add fgradient if user specified a function if (fgrad_fp != nullptr) { std::string grad_name(std::string("_backward_") + name); - regOp.set_attr("FGradient",GradStruct{grad_name.c_str()}, 11); + regOp.set_attr("FGradient", GradStruct{grad_name.c_str()}, 11); nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); gradOp.set_attr("TIsBackward", true, 11); @@ -554,7 +554,7 @@ int MXLoadLib(const char *path) { gradOp.set_num_outputs(num_outputs); gradOp.set_attr("FInferStorageType", infer_storage_type, 11); gradOp.set_attr("FResourceRequest", resc_req, 11); - gradOp.set_attr("FComputeEx",gradient_lambda, 11); + gradOp.set_attr("FComputeEx", gradient_lambda, 11); } } regOp.add_argument("data", "NDArray[]", "Source inputs"); From eec71d6e32a1cf1cd43d672ff3d1c7645d4013b4 Mon Sep 17 00:00:00 2001 From: rondogency Date: Fri, 6 Sep 2019 01:05:13 +0000 Subject: [PATCH 063/111] prototype of createopstate and fstatefulcompute --- example/lib_ops/subgraph_lib.cc | 49 +++++------- include/mxnet/lib_api.h | 132 +++++++++++++++++++++++++++++++- src/c_api/c_api.cc | 94 ++++++++++++++++++++++- 3 files changed, 236 insertions(+), 39 deletions(-) diff --git a/example/lib_ops/subgraph_lib.cc b/example/lib_ops/subgraph_lib.cc index 97c2137931f0..0713bed0ec55 100644 --- a/example/lib_ops/subgraph_lib.cc +++ b/example/lib_ops/subgraph_lib.cc @@ -27,18 +27,6 @@ #include #include "lib_api.h" -void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { - unsigned i,j,kk; - for (i=0;i attrs, int* num_in, int* num_out) { *num_in = 2; @@ -70,23 +58,6 @@ MXReturnValue inferShape(std::map attrs, std::vector attrs, - std::vector inputs, std::vector outputs, - OpResource res) { - //extract data pointers from tensors - float* input1 = inputs[0].getData(); - float* input2 = inputs[1].getData(); - float* output = outputs[0].getData(); - //set tensor shapes - unsigned n = inputs[0].shape[0]; - unsigned k = inputs[0].shape[1]; - unsigned m = inputs[1].shape[1]; - - gemm(input1, input2, output, n, k, m); - - return MX_SUCCESS; -} - MXReturnValue mutateInputs(std::map attrs, std::vector &input_indices) { input_indices.push_back(1); @@ -94,12 +65,28 @@ MXReturnValue mutateInputs(std::map attrs, return MX_SUCCESS; } +MXReturnValue createOpState(std::map attrs, + CustomStatefulOp** op_inst) { + *op_inst = new CustomStatefulOp(); + std::cout << "create op state run" << std::endl; + return MX_SUCCESS; +} + +MXReturnValue forwardStateful(CustomStatefulOp* op_inst, + std::vector inputs, + std::vector outputs) { + op_inst->count++; + std::cout << "forward op state run" << std::endl; + return MX_SUCCESS; +} + REGISTER_OP(subgraph_op) -.setForward(forward) .setParseAttrs(parseAttrs) .setInferType(inferType) .setInferShape(inferShape) -.setMutateInputs(mutateInputs); +.setMutateInputs(mutateInputs) +.setCreateOpState(createOpState) +.setForwardStateful(forwardStateful); MXReturnValue initialize(int version) { if (version >= 10400) { diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 6c1166a7f147..cbe846b085a1 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -98,6 +98,42 @@ class OpResource { void* _xpu_malloc; }; +/*! 
+ * \brief StatefulOp wrapper class to pass to backend OpState + */ +class CustomStatefulOpWrapper { + public: + CustomStatefulOpWrapper(void* inst) : instance(inst) {} + + void* get_instance() { return instance; } + + private: + void* instance; +}; + +/*! + * \brief An prototype interface class for library author creating stateful op + */ +class CustomStatefulOp { + public: + CustomStatefulOp() { + std::cout << "CustomStatefulOp constructor called" << std::endl; + subgraph_sym = "json"; + count = 0; + } + + void Forward() { + std::cout << "CustomStatefulOp forward called" << std::endl; + } + + ~CustomStatefulOp() { + std::cout << "CustomStatefulOp destructor called" << std::endl; + } + + std::string subgraph_sym; + int count; +}; + /*! * Custom Operator function templates */ @@ -113,6 +149,10 @@ typedef MXReturnValue (*inferShape_t)(std::map, std::vector>&); typedef MXReturnValue (*mutateInputs_t)(std::map, std::vector&); +typedef MXReturnValue (*createOpState_t)(std::map, + CustomStatefulOp**); +typedef MXReturnValue (*fstateful_t)(CustomStatefulOp*, std::vector, + std::vector); /*! * \brief Class to hold custom operator registration @@ -120,8 +160,8 @@ typedef MXReturnValue (*mutateInputs_t)(std::map, class CustomOp { public: explicit CustomOp(const char* op_name) : name(op_name), fcompute(nullptr), - fgradient(nullptr), parse_attrs(nullptr), infer_type(nullptr), - infer_shape(nullptr), mutate_inputs(nullptr) {} + fgradient(nullptr), parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr), + mutate_inputs(nullptr), create_op_state(nullptr), fstateful(nullptr) {} ~CustomOp() {} CustomOp& setForward(fcomp_t fcomp) { fcompute = fcomp; @@ -147,6 +187,14 @@ class CustomOp { mutate_inputs = func; return *this; } + CustomOp& setCreateOpState(createOpState_t func) { + create_op_state = func; + return *this; + } + CustomOp& setForwardStateful(fstateful_t func) { + fstateful = func; + return *this; + } /*! \brief operator name */ const char* name; /*! \brief operator functions */ @@ -156,6 +204,8 @@ class CustomOp { inferType_t infer_type; inferShape_t infer_shape; mutateInputs_t mutate_inputs; + createOpState_t create_op_state; + fstateful_t fstateful; }; /*! 
@@ -240,7 +290,8 @@ typedef int (*opRegSize_t)(void); #define MXLIB_OPREGGET_STR "_opRegGet" typedef int (*opRegGet_t)(int, const char**, fcomp_t*, fcomp_t*, parseAttrs_t*, inferType_t*, - inferShape_t*, mutateInputs_t*); + inferShape_t*, mutateInputs_t*, + createOpState_t*, fstateful_t*); #define MXLIB_OPCALLFREE_STR "_opCallFree" typedef int (*opCallFree_t)(void*); @@ -268,6 +319,15 @@ typedef int (*opCallFComp_t)(fcomp_t, const char* const*, const char* const*, in typedef int (*opCallMutateInputs_t)(mutateInputs_t, const char* const*, const char* const*, int, int**, int*); +#define MXLIB_OPCALLCREATEOPSTATE_STR "_opCallCreateOpState" +typedef int (*opCallCreateOpState_t)(createOpState_t, const char* const*, const char* const*, int, + void**); + +#define MXLIB_OPCALLFSTATEFUL_STR "_opCallFStateful" +typedef int (*opCallFStateful_t)(fstateful_t, void*, + const int64_t**, int*, void**, int*, int, + const int64_t**, int*, void**, int*, int); + #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int); @@ -309,7 +369,8 @@ extern "C" { #endif _opRegGet(int idx, const char** name, fcomp_t* fcomp, fcomp_t* fgrad, parseAttrs_t* parse, inferType_t* type, - inferShape_t* shape, mutateInputs_t* mutate) { + inferShape_t* shape, mutateInputs_t* mutate, + createOpState_t* create_op, fstateful_t* fstateful) { CustomOp op = Registry::get()->get(idx); *name = op.name; *fcomp = op.fcompute; @@ -318,6 +379,8 @@ extern "C" { *type = op.infer_type; *shape = op.infer_shape; *mutate = op.mutate_inputs; + *create_op = op.create_op_state; + *fstateful = op.fstateful; } /*! @@ -519,6 +582,67 @@ extern "C" { return retval; } + + /*! + * \brief returns status of calling create stateful op function for operator from library + */ +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl +#else + int +#endif + _opCallCreateOpState(createOpState_t create_op, const char* const* keys, + const char* const* vals, int num, + void** state_op) { + // create map of attributes from list + std::map attrs; + for (int i = 0; i < num; i++) { + attrs[std::string(keys[i])] = std::string(vals[i]); + } + + // void pointer to hold custom state op instance created in custom library + CustomStatefulOp** op_ptr = reinterpret_cast(state_op); + return create_op(attrs, op_ptr); + } + + /*! + * \brief returns status of calling FStateful function for operator from library + */ +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + __declspec(dllexport) int __cdecl +#else + int +#endif + _opCallFStateful(fstateful_t fstateful, void* state_op_inst, + const int64_t** inshapes, int* indims, + void** indata, int* intypes, int num_in, + const int64_t** outshapes, int* outdims, + void** outdata, int* outtypes, int num_out) { + // create a vector of tensors for inputs + std::vector inputs(num_in); + for (int i = 0; i < num_in; i++) { + inputs[i].data = indata[i]; + inputs[i].dtype = (MXDType)intypes[i]; + for (int j = 0; j < indims[i]; j++) { + inputs[i].shape.push_back(inshapes[i][j]); + } + } + + // create a vector of tensors for outputs + std::vector outputs(num_out); + for (int i = 0; i < num_out; i++) { + outputs[i].data = outdata[i]; + outputs[i].dtype = (MXDType) outtypes[i]; + for (int j = 0; j < outdims[i]; j++) { + outputs[i].shape.push_back(outshapes[i][j]); + } + } + + // pass the stateful op instance to stateful forward in custom library + CustomStatefulOp* state_op = reinterpret_cast(state_op_inst); + return fstateful(state_op, inputs, outputs); + } + /*! 
* \brief Checks if the MXNet version is supported by the library. * If supported, initializes the library. diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 1ba78e2a5c0d..766e2ca606ee 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -134,6 +134,12 @@ int MXLoadLib(const char *path) { opCallMutateInputs_t callMutateInputs = get_func(lib, const_cast(MXLIB_OPCALLMUTATEINPUTS_STR)); + opCallCreateOpState_t callCreateOpState = + get_func(lib, const_cast(MXLIB_OPCALLCREATEOPSTATE_STR)); + + opCallFStateful_t callFStateful= + get_func(lib, const_cast(MXLIB_OPCALLFSTATEFUL_STR)); + // get number of operators registered in the library opRegSize_t opRegSize = get_func(lib, const_cast(MXLIB_OPREGSIZE_STR)); int numOps = opRegSize(); @@ -154,12 +160,15 @@ int MXLoadLib(const char *path) { inferShape_t shape_fp = nullptr; // optional attributes mutateInputs_t mutate_fp = nullptr; + createOpState_t create_op_state_fp = nullptr; + fstateful_t fstateful_fp = nullptr; // get custom operator implemenation from the dynamic library - opRegGet(i, &name, &fcomp_fp, &fgrad_fp, &parse_fp, &type_fp, &shape_fp, &mutate_fp); + opRegGet(i, &name, &fcomp_fp, &fgrad_fp, &parse_fp, &type_fp, &shape_fp, + &mutate_fp, &create_op_state_fp, &fstateful_fp); // validate custom operator functions from the dynamic library - CHECK(fcomp_fp != nullptr) << "Error loading '" << name + CHECK(fcomp_fp != nullptr || fstateful_fp != nullptr) << "Error loading '" << name << "' custom op, FCompute function was not set."; CHECK(parse_fp != nullptr) << "Error loading '" << name << "' custom op, ParseAttrs function was not set."; @@ -468,6 +477,9 @@ int MXLoadLib(const char *path) { DispatchMode* dispatch_mode, std::vector* in_stypes, std::vector* out_stypes) { + // TODO(ziyimu): remove this dense enforce check after supporting sparse tensor + CHECK(mxnet::common::ContainsOnlyStorage(*in_stypes, mxnet::kDefaultStorage)) + << "Error input tensors are not dense for custom operator '" << name_str << "'"; // set outputs as dense return op::storage_type_assign(out_stypes, mxnet::kDefaultStorage, dispatch_mode, DispatchMode::kFComputeEx); @@ -501,6 +513,68 @@ int MXLoadLib(const char *path) { return std::vector{ResourceRequest::kTempSpace}; }; + // library author should implement and return a 'state' which points to an instance + // in lambda we create OpStatePtr using the returned 'state' + auto create_op_state = [=] (const NodeAttrs& attrs, + Context ctx, + const std::vector& in_shapes, + const std::vector& in_types) { + // convert attributes to vector of char* + std::vector attr_keys, attr_vals; + for (auto kv : attrs.dict) { + attr_keys.push_back(kv.first.c_str()); + attr_vals.push_back(kv.second.c_str()); + } + + // create a pointer to hold custom op state object + void* state_op_inst = nullptr; + CHECK(callCreateOpState(create_op_state_fp, + attr_keys.data(), attr_vals.data(), attr_keys.size(), + &state_op_inst)); + CHECK(state_op_inst != nullptr); + + return OpStatePtr::Create(state_op_inst); + }; + + auto fstateful_forward_lambda = [=](const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + std::vector in_data, out_data; + std::vector in_shapes, out_shapes; + std::vector in_dims, out_dims; + std::vector in_types, out_types; + + // convert input tensors to constituent parts + for (size_t i = 0; i < inputs.size(); i++) { + in_data.push_back(inputs[i].data().dptr_); + in_shapes.push_back(inputs[i].shape().data()); + 
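// Annotation on the conversion in this loop: each NDArray crosses the C
// boundary as four parallel arrays (data pointer, shape pointer, rank and
// dtype). A 2x2 float32 input i contributes, assuming the usual MXNet type
// codes where kFloat32 == 0:
//   in_data[i]   = dptr_     (void*)
//   in_shapes[i] -> {2, 2}   (const int64_t*)
//   in_dims[i]   = 2
//   in_types[i]  = 0
// _opCallFStateful on the library side rebuilds MXTensor objects from these
// arrays before invoking the stateful op.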
in_dims.push_back(inputs[i].shape().ndim()); + in_types.push_back(inputs[i].dtype()); + } + + // convert output tensors to constituent parts + for (size_t i = 0; i < outputs.size(); i++) { + out_data.push_back(outputs[i].data().dptr_); + out_shapes.push_back(outputs[i].shape().data()); + out_dims.push_back(outputs[i].shape().ndim()); + out_types.push_back(outputs[i].dtype()); + } + + // retrieve op state object created from CreateOpState + CustomStatefulOpWrapper& op = state_ptr.get_state(); + void* state_op_inst = op.get_instance(); + CHECK(state_op_inst != nullptr); + + CHECK(callFStateful(fstateful_fp, state_op_inst, + in_shapes.data(), in_dims.data(), in_data.data(), + in_types.data(), in_data.size(), + out_shapes.data(), out_dims.data(), out_data.data(), + out_types.data(), out_data.size())); + // return type void + }; + // check if operator is already registered const nnvm::Op *regOpPtr = dmlc::Registry::Get()->Find(name); nnvm::Op ®Op = dmlc::Registry::Get()->__REGISTER_OR_GET__(name); @@ -511,9 +585,15 @@ int MXLoadLib(const char *path) { regOp.set_attr("FResourceRequest", resc_req); if (regOpPtr == nullptr) { // re-register op in MXNet using lambda converter functions + if (fstateful_fp != nullptr) { + regOp.set_attr("FCreateOpState", create_op_state); + regOp.set_attr("FStatefulComputeEx", fstateful_forward_lambda); + } + else { + regOp.set_attr("FComputeEx", forward_lambda); + } regOp.set_attr("FInferType", infer_type); regOp.set_attr("FInferShape", infer_shape); - regOp.set_attr("FComputeEx", forward_lambda); // optionally add fmutate inputs if user specified a function if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs); @@ -536,9 +616,15 @@ int MXLoadLib(const char *path) { regOp.arguments.clear(); // set attribute with higher plevel (11) to allow re-registering once // TODO(samskalicky): enable constant overwriting of registertion multiple times + if (fstateful_fp != nullptr) { + regOp.set_attr("FCreateOpState", create_op_state, 11); + regOp.set_attr("FStatefulComputeEx", fstateful_forward_lambda, 11); + } + else { + regOp.set_attr("FComputeEx", forward_lambda, 11); + } regOp.set_attr("FInferType", infer_type, 11); regOp.set_attr("FInferShape", infer_shape, 11); - regOp.set_attr("FComputeEx", forward_lambda, 11); // optionally add fmutate inputs if user specified a function if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs, 11); From abcb8cb558b0adc1d2c61cbfbf407315c5ebab0f Mon Sep 17 00:00:00 2001 From: rondogency Date: Fri, 6 Sep 2019 20:55:01 +0000 Subject: [PATCH 064/111] make custom state op interface work --- example/lib_ops/subgraph_lib.cc | 33 ++++++++++++++++++++++++++++---- example/lib_ops/test_subgraph.py | 1 + include/mxnet/lib_api.h | 20 ++----------------- 3 files changed, 32 insertions(+), 22 deletions(-) diff --git a/example/lib_ops/subgraph_lib.cc b/example/lib_ops/subgraph_lib.cc index 0713bed0ec55..b497cb8c3512 100644 --- a/example/lib_ops/subgraph_lib.cc +++ b/example/lib_ops/subgraph_lib.cc @@ -65,18 +65,43 @@ MXReturnValue mutateInputs(std::map attrs, return MX_SUCCESS; } +class MyStatefulOp : public CustomStatefulOp { + public: + MyStatefulOp(std::string sym, int count) : subgraph_sym(sym), count(count) {} + + void Forward() { + count++; + } + + int State() { + return count; + } + + ~MyStatefulOp() {} + + private: + std::string subgraph_sym; + int count; +}; + MXReturnValue createOpState(std::map attrs, CustomStatefulOp** op_inst) { - *op_inst = new CustomStatefulOp(); - std::cout << "create op state run" << 
std::endl; + *op_inst = new MyStatefulOp("json", 0); + std::cout << "create op state successful" << std::endl; return MX_SUCCESS; } MXReturnValue forwardStateful(CustomStatefulOp* op_inst, std::vector inputs, std::vector outputs) { - op_inst->count++; - std::cout << "forward op state run" << std::endl; + MyStatefulOp* my_op_inst = static_cast(op_inst); + if (my_op_inst == nullptr) { + std::cout << "stateful op loading failed" << std::endl; + return MX_FAIL; + } + + my_op_inst->Forward(); + std::cout << "forward op state run " << my_op_inst->State() << std::endl; return MX_SUCCESS; } diff --git a/example/lib_ops/test_subgraph.py b/example/lib_ops/test_subgraph.py index 279c6af6ca12..8747b894fbac 100644 --- a/example/lib_ops/test_subgraph.py +++ b/example/lib_ops/test_subgraph.py @@ -47,4 +47,5 @@ c = mx.sym.subgraph_op(s,t) exe = c.bind(ctx=mx.cpu(),args={'s':a},aux_states={'t':b}) out = exe.forward() +out = exe.forward() print(out) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index cbe846b085a1..b6af84c03592 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -104,9 +104,7 @@ class OpResource { class CustomStatefulOpWrapper { public: CustomStatefulOpWrapper(void* inst) : instance(inst) {} - void* get_instance() { return instance; } - private: void* instance; }; @@ -116,22 +114,8 @@ class CustomStatefulOpWrapper { */ class CustomStatefulOp { public: - CustomStatefulOp() { - std::cout << "CustomStatefulOp constructor called" << std::endl; - subgraph_sym = "json"; - count = 0; - } - - void Forward() { - std::cout << "CustomStatefulOp forward called" << std::endl; - } - - ~CustomStatefulOp() { - std::cout << "CustomStatefulOp destructor called" << std::endl; - } - - std::string subgraph_sym; - int count; + virtual void Forward() = 0; + virtual ~CustomStatefulOp() = 0; }; /*! 
From 9cf0455d202b4bcdfc724ae42f3ccc13c41da763 Mon Sep 17 00:00:00 2001 From: rondogency Date: Mon, 9 Sep 2019 06:52:58 +0000 Subject: [PATCH 065/111] subgraph forward --- example/lib_ops/mylib.cc | 2 -- example/lib_ops/subgraph_lib.cc | 27 +++++++++++++++++-------- include/mxnet/lib_api.h | 8 +++++++- src/c_api/c_api.cc | 36 +++++++++++++++++++++++---------- 4 files changed, 51 insertions(+), 22 deletions(-) diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc index 5d09760d7b74..8f3c290ddf16 100644 --- a/example/lib_ops/mylib.cc +++ b/example/lib_ops/mylib.cc @@ -42,7 +42,6 @@ void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { } } - MXReturnValue forward(std::map attrs, std::vector inputs, std::vector outputs, OpResource res) { @@ -159,4 +158,3 @@ MXReturnValue initialize(int version) { return MX_FAIL; } } - diff --git a/example/lib_ops/subgraph_lib.cc b/example/lib_ops/subgraph_lib.cc index b497cb8c3512..3c9d2bc4ff7d 100644 --- a/example/lib_ops/subgraph_lib.cc +++ b/example/lib_ops/subgraph_lib.cc @@ -69,14 +69,21 @@ class MyStatefulOp : public CustomStatefulOp { public: MyStatefulOp(std::string sym, int count) : subgraph_sym(sym), count(count) {} - void Forward() { + void Forward(std::vector& inputs, std::vector& outputs) { count++; + float* input1 = inputs[0].getData(); + float* output = outputs[0].getData(); + unsigned n = inputs[0].shape[0]; + unsigned m = inputs[0].shape[1]; + for (int i = 0; i < n; i++) { + for (int j = 0; j < m; j++) { + output[i * m + j] = input1[i * m + j] + count; + } + } + std::cout << "subgraph " << subgraph_sym << " forwarding" << std::endl; } - int State() { - return count; - } - + int State() { return count; } ~MyStatefulOp() {} private: @@ -86,7 +93,11 @@ class MyStatefulOp : public CustomStatefulOp { MXReturnValue createOpState(std::map attrs, CustomStatefulOp** op_inst) { - *op_inst = new MyStatefulOp("json", 0); + std::string serialized_subgraph = "[]"; + if (attrs.count(SUBGRAPH)) { + serialized_subgraph = attrs[SUBGRAPH]; + } + *op_inst = new MyStatefulOp(serialized_subgraph, 0); std::cout << "create op state successful" << std::endl; return MX_SUCCESS; } @@ -94,13 +105,14 @@ MXReturnValue createOpState(std::map attrs, MXReturnValue forwardStateful(CustomStatefulOp* op_inst, std::vector inputs, std::vector outputs) { + // retrieve the statful op instance MyStatefulOp* my_op_inst = static_cast(op_inst); if (my_op_inst == nullptr) { std::cout << "stateful op loading failed" << std::endl; return MX_FAIL; } - my_op_inst->Forward(); + my_op_inst->Forward(inputs, outputs); std::cout << "forward op state run " << my_op_inst->State() << std::endl; return MX_SUCCESS; } @@ -122,4 +134,3 @@ MXReturnValue initialize(int version) { return MX_FAIL; } } - diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index b6af84c03592..c870f41d0107 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -109,12 +109,18 @@ class CustomStatefulOpWrapper { void* instance; }; +/*! + * \brief Macro to help passing serialized subgraph through attribute dict + */ +#define SUBGRAPH "subgraph_sym_json" + /*! 
 * \brief A prototype interface class for a library author creating a stateful op
  */
 class CustomStatefulOp {
  public:
-  virtual void Forward() = 0;
+  virtual void Forward(std::vector<MXTensor>& inputs,
+                       std::vector<MXTensor>& outputs) = 0;
   virtual ~CustomStatefulOp() = 0;
 };
 
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 766e2ca606ee..867efe2cddfe 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -54,6 +54,7 @@
 #include "../operator/tensor/matrix_op-inl.h"
 #include "../operator/tvmop/op_module.h"
 #include "../common/utils.h"
+#include "nnvm/pass_functions.h"
 
 using namespace mxnet;
 
@@ -137,7 +138,7 @@ int MXLoadLib(const char *path) {
   opCallCreateOpState_t callCreateOpState =
     get_func<opCallCreateOpState_t>(lib, const_cast<char*>(MXLIB_OPCALLCREATEOPSTATE_STR));
 
-  opCallFStateful_t callFStateful=
+  opCallFStateful_t callFStateful =
     get_func<opCallFStateful_t>(lib, const_cast<char*>(MXLIB_OPCALLFSTATEFUL_STR));
 
   // get number of operators registered in the library
@@ -526,12 +527,24 @@ int MXLoadLib(const char *path) {
       attr_vals.push_back(kv.second.c_str());
     }
 
+    // convert subgraph symbol from node attributes to char*
+    if (!attrs.subgraphs.empty()) {
+      nnvm::Graph g;
+      g.outputs = attrs.subgraphs[0].get()->outputs;
+      const std::string serialized_subgraph = nnvm::pass::SaveJSON(g);
+      const std::string subgraph = SUBGRAPH;
+      attr_keys.push_back(subgraph.c_str());
+      attr_vals.push_back(serialized_subgraph.c_str());
+    }
+
     // create a pointer to hold custom op state object
     void* state_op_inst = nullptr;
-    CHECK(callCreateOpState(create_op_state_fp,
-                            attr_keys.data(), attr_vals.data(), attr_keys.size(),
-                            &state_op_inst));
-    CHECK(state_op_inst != nullptr);
+    CHECK(callCreateOpState(create_op_state_fp, attr_keys.data(), attr_vals.data(),
+                            attr_keys.size(), &state_op_inst))
+      << "Error calling CreateOpState for custom operator '" << name_str << "'";
+
+    CHECK(state_op_inst != nullptr)
+      << "Error custom library failed to create stateful operator '" << name_str << "'";
 
     return OpStatePtr::Create<CustomStatefulOpWrapper>(state_op_inst);
   };
@@ -565,13 +578,16 @@ int MXLoadLib(const char *path) {
     // retrieve op state object created from CreateOpState
     CustomStatefulOpWrapper& op = state_ptr.get_state<CustomStatefulOpWrapper>();
     void* state_op_inst = op.get_instance();
-    CHECK(state_op_inst != nullptr);
+    CHECK(state_op_inst != nullptr)
+      << "Error MXNet cannot load custom stateful operator'" << name_str << "'";
 
     CHECK(callFStateful(fstateful_fp, state_op_inst,
                         in_shapes.data(), in_dims.data(), in_data.data(),
                         in_types.data(), in_data.size(),
                         out_shapes.data(), out_dims.data(), out_data.data(),
-                        out_types.data(), out_data.size()));
+                        out_types.data(), out_data.size()))
+      << "Error calling ForwardStateful for custom operator '" << name_str << "'";
+
     // return type void
   };
 
@@ -588,8 +604,7 @@ int MXLoadLib(const char *path) {
     if (fstateful_fp != nullptr) {
       regOp.set_attr<FCreateOpState>("FCreateOpState", create_op_state);
       regOp.set_attr<FStatefulComputeEx>("FStatefulComputeEx", fstateful_forward_lambda);
-    }
-    else {
+    } else {
       regOp.set_attr<FComputeEx>("FComputeEx", forward_lambda);
     }
     regOp.set_attr<nnvm::FInferType>("FInferType", infer_type);
@@ -619,8 +634,7 @@ int MXLoadLib(const char *path) {
     if (fstateful_fp != nullptr) {
      regOp.set_attr<FCreateOpState>("FCreateOpState", create_op_state, 11);
      regOp.set_attr<FStatefulComputeEx>("FStatefulComputeEx", fstateful_forward_lambda, 11);
-    }
-    else {
+    } else {
      regOp.set_attr<FComputeEx>("FComputeEx", forward_lambda, 11);
     }
     regOp.set_attr<nnvm::FInferType>("FInferType", infer_type, 11);

From 82f1bffd6285a30cb03b4d1b04adbec79df01d26 Mon Sep 17 00:00:00 2001
From: rondogency
Date: Tue, 10 Sep 2019 06:25:17 +0000
Subject: [PATCH 066/111] refactor stateful forward and add op resource

---
example/lib_ops/subgraph_lib.cc | 28 +++------- include/mxnet/lib_api.h | 94 ++++++++------------------------ src/c_api/c_api.cc | 97 +++++++++++++++++++-------------- 3 files changed, 86 insertions(+), 133 deletions(-) diff --git a/example/lib_ops/subgraph_lib.cc b/example/lib_ops/subgraph_lib.cc index 3c9d2bc4ff7d..16319ff8594e 100644 --- a/example/lib_ops/subgraph_lib.cc +++ b/example/lib_ops/subgraph_lib.cc @@ -69,7 +69,9 @@ class MyStatefulOp : public CustomStatefulOp { public: MyStatefulOp(std::string sym, int count) : subgraph_sym(sym), count(count) {} - void Forward(std::vector& inputs, std::vector& outputs) { + int Forward(std::vector& inputs, + std::vector& outputs, + OpResource op_res) { count++; float* input1 = inputs[0].getData(); float* output = outputs[0].getData(); @@ -81,6 +83,10 @@ class MyStatefulOp : public CustomStatefulOp { } } std::cout << "subgraph " << subgraph_sym << " forwarding" << std::endl; + int* p = static_cast(op_res.alloc(sizeof(int))); + *p = 42; + std::cout << *p << std::endl; + return MX_SUCCESS; } int State() { return count; } @@ -93,7 +99,7 @@ class MyStatefulOp : public CustomStatefulOp { MXReturnValue createOpState(std::map attrs, CustomStatefulOp** op_inst) { - std::string serialized_subgraph = "[]"; + std::string serialized_subgraph = "[empty]"; if (attrs.count(SUBGRAPH)) { serialized_subgraph = attrs[SUBGRAPH]; } @@ -102,28 +108,12 @@ MXReturnValue createOpState(std::map attrs, return MX_SUCCESS; } -MXReturnValue forwardStateful(CustomStatefulOp* op_inst, - std::vector inputs, - std::vector outputs) { - // retrieve the statful op instance - MyStatefulOp* my_op_inst = static_cast(op_inst); - if (my_op_inst == nullptr) { - std::cout << "stateful op loading failed" << std::endl; - return MX_FAIL; - } - - my_op_inst->Forward(inputs, outputs); - std::cout << "forward op state run " << my_op_inst->State() << std::endl; - return MX_SUCCESS; -} - REGISTER_OP(subgraph_op) .setParseAttrs(parseAttrs) .setInferType(inferType) .setInferShape(inferShape) .setMutateInputs(mutateInputs) -.setCreateOpState(createOpState) -.setForwardStateful(forwardStateful); +.setCreateOpState(createOpState); MXReturnValue initialize(int version) { if (version >= 10400) { diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index c870f41d0107..ab8e36596fe3 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -98,17 +98,6 @@ class OpResource { void* _xpu_malloc; }; -/*! - * \brief StatefulOp wrapper class to pass to backend OpState - */ -class CustomStatefulOpWrapper { - public: - CustomStatefulOpWrapper(void* inst) : instance(inst) {} - void* get_instance() { return instance; } - private: - void* instance; -}; - /*! * \brief Macro to help passing serialized subgraph through attribute dict */ @@ -119,11 +108,23 @@ class CustomStatefulOpWrapper { */ class CustomStatefulOp { public: - virtual void Forward(std::vector& inputs, - std::vector& outputs) = 0; + virtual int Forward(std::vector& inputs, + std::vector& outputs, + OpResource op_res) = 0; virtual ~CustomStatefulOp() = 0; }; +/*! + * \brief StatefulOp wrapper class to pass to backend OpState + */ +class CustomStatefulOpWrapper { + public: + explicit CustomStatefulOpWrapper(CustomStatefulOp* inst) : instance(inst) {} + CustomStatefulOp* get_instance() { return instance; } + private: + CustomStatefulOp* instance; +}; + /*! 
* Custom Operator function templates */ @@ -141,8 +142,6 @@ typedef MXReturnValue (*mutateInputs_t)(std::map, std::vector&); typedef MXReturnValue (*createOpState_t)(std::map, CustomStatefulOp**); -typedef MXReturnValue (*fstateful_t)(CustomStatefulOp*, std::vector, - std::vector); /*! * \brief Class to hold custom operator registration @@ -151,7 +150,7 @@ class CustomOp { public: explicit CustomOp(const char* op_name) : name(op_name), fcompute(nullptr), fgradient(nullptr), parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr), - mutate_inputs(nullptr), create_op_state(nullptr), fstateful(nullptr) {} + mutate_inputs(nullptr), create_opstate(nullptr) {} ~CustomOp() {} CustomOp& setForward(fcomp_t fcomp) { fcompute = fcomp; @@ -178,11 +177,7 @@ class CustomOp { return *this; } CustomOp& setCreateOpState(createOpState_t func) { - create_op_state = func; - return *this; - } - CustomOp& setForwardStateful(fstateful_t func) { - fstateful = func; + create_opstate = func; return *this; } /*! \brief operator name */ @@ -194,8 +189,7 @@ class CustomOp { inferType_t infer_type; inferShape_t infer_shape; mutateInputs_t mutate_inputs; - createOpState_t create_op_state; - fstateful_t fstateful; + createOpState_t create_opstate; }; /*! @@ -281,7 +275,7 @@ typedef int (*opRegSize_t)(void); typedef int (*opRegGet_t)(int, const char**, fcomp_t*, fcomp_t*, parseAttrs_t*, inferType_t*, inferShape_t*, mutateInputs_t*, - createOpState_t*, fstateful_t*); + createOpState_t*); #define MXLIB_OPCALLFREE_STR "_opCallFree" typedef int (*opCallFree_t)(void*); @@ -313,11 +307,6 @@ typedef int (*opCallMutateInputs_t)(mutateInputs_t, const char* const*, const ch typedef int (*opCallCreateOpState_t)(createOpState_t, const char* const*, const char* const*, int, void**); -#define MXLIB_OPCALLFSTATEFUL_STR "_opCallFStateful" -typedef int (*opCallFStateful_t)(fstateful_t, void*, - const int64_t**, int*, void**, int*, int, - const int64_t**, int*, void**, int*, int); - #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int); @@ -360,7 +349,7 @@ extern "C" { _opRegGet(int idx, const char** name, fcomp_t* fcomp, fcomp_t* fgrad, parseAttrs_t* parse, inferType_t* type, inferShape_t* shape, mutateInputs_t* mutate, - createOpState_t* create_op, fstateful_t* fstateful) { + createOpState_t* create_op) { CustomOp op = Registry::get()->get(idx); *name = op.name; *fcomp = op.fcompute; @@ -369,8 +358,7 @@ extern "C" { *type = op.infer_type; *shape = op.infer_shape; *mutate = op.mutate_inputs; - *create_op = op.create_op_state; - *fstateful = op.fstateful; + *create_op = op.create_opstate; } /*! @@ -582,8 +570,8 @@ extern "C" { int #endif _opCallCreateOpState(createOpState_t create_op, const char* const* keys, - const char* const* vals, int num, - void** state_op) { + const char* const* vals, int num, + void** state_op) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -595,44 +583,6 @@ extern "C" { return create_op(attrs, op_ptr); } - /*! 
- * \brief returns status of calling FStateful function for operator from library - */ -#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - __declspec(dllexport) int __cdecl -#else - int -#endif - _opCallFStateful(fstateful_t fstateful, void* state_op_inst, - const int64_t** inshapes, int* indims, - void** indata, int* intypes, int num_in, - const int64_t** outshapes, int* outdims, - void** outdata, int* outtypes, int num_out) { - // create a vector of tensors for inputs - std::vector inputs(num_in); - for (int i = 0; i < num_in; i++) { - inputs[i].data = indata[i]; - inputs[i].dtype = (MXDType)intypes[i]; - for (int j = 0; j < indims[i]; j++) { - inputs[i].shape.push_back(inshapes[i][j]); - } - } - - // create a vector of tensors for outputs - std::vector outputs(num_out); - for (int i = 0; i < num_out; i++) { - outputs[i].data = outdata[i]; - outputs[i].dtype = (MXDType) outtypes[i]; - for (int j = 0; j < outdims[i]; j++) { - outputs[i].shape.push_back(outshapes[i][j]); - } - } - - // pass the stateful op instance to stateful forward in custom library - CustomStatefulOp* state_op = reinterpret_cast(state_op_inst); - return fstateful(state_op, inputs, outputs); - } - /*! * \brief Checks if the MXNet version is supported by the library. * If supported, initializes the library. diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 867efe2cddfe..cf68f68675b8 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -138,9 +138,6 @@ int MXLoadLib(const char *path) { opCallCreateOpState_t callCreateOpState = get_func(lib, const_cast(MXLIB_OPCALLCREATEOPSTATE_STR)); - opCallFStateful_t callFStateful = - get_func(lib, const_cast(MXLIB_OPCALLFSTATEFUL_STR)); - // get number of operators registered in the library opRegSize_t opRegSize = get_func(lib, const_cast(MXLIB_OPREGSIZE_STR)); int numOps = opRegSize(); @@ -161,15 +158,14 @@ int MXLoadLib(const char *path) { inferShape_t shape_fp = nullptr; // optional attributes mutateInputs_t mutate_fp = nullptr; - createOpState_t create_op_state_fp = nullptr; - fstateful_t fstateful_fp = nullptr; + createOpState_t create_opstate_fp = nullptr; // get custom operator implemenation from the dynamic library opRegGet(i, &name, &fcomp_fp, &fgrad_fp, &parse_fp, &type_fp, &shape_fp, - &mutate_fp, &create_op_state_fp, &fstateful_fp); + &mutate_fp, &create_opstate_fp); // validate custom operator functions from the dynamic library - CHECK(fcomp_fp != nullptr || fstateful_fp != nullptr) << "Error loading '" << name + CHECK(fcomp_fp != nullptr || create_opstate_fp != nullptr) << "Error loading '" << name << "' custom op, FCompute function was not set."; CHECK(parse_fp != nullptr) << "Error loading '" << name << "' custom op, ParseAttrs function was not set."; @@ -407,14 +403,12 @@ int MXLoadLib(const char *path) { return data.dptr_; }; - typedef decltype(cpu_alloc) alloc_type; - // create lambda without captures so that we can cast it to function pointer // this needs to be a lambda function so that we can do the decltype cast + typedef decltype(cpu_alloc) alloc_type; auto cpu_malloc = [](void* _cpu_alloc, int size) { // cast the void* argument to the type for the cpu_alloc lambda function alloc_type* cpualloc = static_cast(_cpu_alloc); - void* ptr = (*cpualloc)(size); return ptr; }; @@ -516,7 +510,7 @@ int MXLoadLib(const char *path) { // library author should implement and return a 'state' which points to an instance // in lambda we create OpStatePtr using the returned 'state' - auto create_op_state = [=] (const NodeAttrs& attrs, + auto 
create_opstate = [=] (const NodeAttrs& attrs, Context ctx, const std::vector& in_shapes, const std::vector& in_types) { @@ -539,53 +533,72 @@ int MXLoadLib(const char *path) { // create a pointer to hold custom op state object void* state_op_inst = nullptr; - CHECK(callCreateOpState(create_op_state_fp, attr_keys.data(), attr_vals.data(), + CHECK(callCreateOpState(create_opstate_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), &state_op_inst)) << "Error calling CreateOpState for custom operator '" << name_str << "'"; CHECK(state_op_inst != nullptr) << "Error custom library failed to create stateful operator '" << name_str << "'"; - return OpStatePtr::Create(state_op_inst); + CustomStatefulOp* state_op = reinterpret_cast(state_op_inst); + return OpStatePtr::Create(state_op); }; auto fstateful_forward_lambda = [=](const OpStatePtr& state_ptr, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - std::vector in_data, out_data; - std::vector in_shapes, out_shapes; - std::vector in_dims, out_dims; - std::vector in_types, out_types; - - // convert input tensors to constituent parts + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + // create a vector of tensors for inputs + std::vector c_inputs(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { - in_data.push_back(inputs[i].data().dptr_); - in_shapes.push_back(inputs[i].shape().data()); - in_dims.push_back(inputs[i].shape().ndim()); - in_types.push_back(inputs[i].dtype()); + c_inputs[i].data = inputs[i].data().dptr_; + c_inputs[i].dtype = (MXDType)inputs[i].dtype(); + for (int_least16_t j = 0; j < inputs[i].shape().ndim(); j++) { + c_inputs[i].shape.push_back(inputs[i].shape().data()[j]); + } } - // convert output tensors to constituent parts + // create a vector of tensors for outputs + std::vector c_outputs(outputs.size()); for (size_t i = 0; i < outputs.size(); i++) { - out_data.push_back(outputs[i].data().dptr_); - out_shapes.push_back(outputs[i].shape().data()); - out_dims.push_back(outputs[i].shape().ndim()); - out_types.push_back(outputs[i].dtype()); + c_outputs[i].data = outputs[i].data().dptr_; + c_outputs[i].dtype = (MXDType)outputs[i].dtype(); + for (int j = 0; j < outputs[i].shape().ndim(); j++) { + c_outputs[i].shape.push_back(outputs[i].shape().data()[j]); + } } + // get memory resource + const Resource &resource = ctx.requested[0]; + mshadow::Stream *cpu_stream = ctx.get_stream(); + + // create lambda that captures stream & resource objects + auto cpu_alloc = [&](int size) { + mshadow::Tensor data = + resource.get_space_typed(mshadow::Shape1(size), cpu_stream); + return data.dptr_; + }; + + // create lambda without captures so that we can cast it to function pointer + // this needs to be a lambda function so that we can do the decltype cast + typedef decltype(cpu_alloc) alloc_type; + auto cpu_malloc = [](void* _cpu_alloc, int size) { + // cast the void* argument to the type for the cpu_alloc lambda function + alloc_type* cpualloc = static_cast(_cpu_alloc); + void* ptr = (*cpualloc)(size); + return ptr; + }; + + OpResource op_res(cpu_malloc, &cpu_alloc); + // retrieve op state object created from CreateOpState CustomStatefulOpWrapper& op = state_ptr.get_state(); - void* state_op_inst = op.get_instance(); + CustomStatefulOp* state_op_inst = op.get_instance(); CHECK(state_op_inst != nullptr) << "Error MXNet cannot load custom stateful operator'" << name_str << "'"; - 
CHECK(callFStateful(fstateful_fp, state_op_inst,
-                        in_shapes.data(), in_dims.data(), in_data.data(),
-                        in_types.data(), in_data.size(),
-                        out_shapes.data(), out_dims.data(), out_data.data(),
-                        out_types.data(), out_data.size()))
+    CHECK(state_op_inst->Forward(c_inputs, c_outputs, op_res))
       << "Error calling ForwardStateful for custom operator '" << name_str << "'";
 
     // return type void
   };

From ba563d2a6795db5c95256c170701e7fd3bc1addd Mon Sep 17 00:00:00 2001
From: Sam Skalicky
Date: Tue, 10 Sep 2019 06:30:12 +0000
Subject: [PATCH 067/111] wip gemm backward

---
 example/lib_ops/mylib.cc | 63 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 59 insertions(+), 4 deletions(-)

diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/mylib.cc
index 5d09760d7b74..d338b3a586e7 100644
--- a/example/lib_ops/mylib.cc
+++ b/example/lib_ops/mylib.cc
@@ -42,7 +42,21 @@ void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) {
   }
 }
 
+// flat-index transpose: At (m x n) = transpose of A (n x m)
+void transpose(float* A, float* At, unsigned n, unsigned m) {
+  unsigned i,j;
+  for (i=0; i < n; i++) {
+    for (j=0; j < m; j++) {
+      At[j*n+i] = A[i*m+j];
+    }
+  }
+}
 
+/*
+ * Executes C = A * B
+ * inputs[0] = A
+ * inputs[1] = B
+ * outputs[0] = C
+ */
 MXReturnValue forward(std::map<std::string, std::string> attrs,
                       std::vector<MXTensor> inputs, std::vector<MXTensor> outputs,
                       OpResource res) {
@@ -55,15 +69,56 @@ MXReturnValue forward(std::map<std::string, std::string> attrs,
   }
 
   //extract data pointers from tensors
-  float* input1 = inputs[0].getData<float>();
-  float* input2 = inputs[1].getData<float>();
-  float* output = outputs[0].getData<float>();
+  float* A = inputs[0].getData<float>();
+  float* B = inputs[1].getData<float>();
+  float* C = outputs[0].getData<float>();
+  //set tensor shapes
+  unsigned n = inputs[0].shape[0];
+  unsigned k = inputs[0].shape[1];
+  unsigned m = inputs[1].shape[1];
+
+  gemm(A, B, C, n, k, m);
+
+  return MX_SUCCESS;
+}
+
+/*
+ * Executes dA = dC * B.T
+ * Executes dB = A.T * dC
+ ***** gradient inputs
+ * inputs[0] = dC
+ ***** original inputs
+ * inputs[1] = A
+ * inputs[2] = B
+ ***** original outputs
+ * inputs[3] = C
+ ***** gradient outputs
+ * outputs[0] = dA
+ * outputs[1] = dB
+ */
+MXReturnValue backward(std::map<std::string, std::string> attrs,
+                       std::vector<MXTensor> inputs, std::vector<MXTensor> outputs,
+                       OpResource res) {
+  //validate inputs
+  for(int i=0; i<inputs.size(); i++) {
+    if(inputs[i].dtype != kFloat32) {
+      std::cout << "Expected input " << i << " to have float32 type" << std::endl;
+      return MX_FAIL;
+    }
+  }
+
+  //extract data pointers from tensors
+  float* dC = inputs[0].getData<float>();
+  float* A = inputs[1].getData<float>();
+  float* B = inputs[2].getData<float>();
+  float* dA = outputs[0].getData<float>();
+  float* dB = outputs[1].getData<float>();
   //set tensor shapes
   unsigned n = inputs[0].shape[0];
   unsigned k = inputs[0].shape[1];
   unsigned m = inputs[1].shape[1];
 
-  gemm(input1, input2, output, n, k, m);
+  //gemm(input1, input2, output, n, k, m);
 
   return MX_SUCCESS;
 }
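The backward pass above is still a work in progress (the gemm call is commented
out). For reference, a self-contained sketch of the intended math, dA = dC * B.T
and dB = A.T * dC, with the dimension arguments spelled out. This is an
illustration only, not part of the patch; the gemm_backward_sketch name is made
up, and the helper declarations are assumed to match the functions in mylib.cc
above:

    #include <vector>

    // Assumed helpers (as defined in mylib.cc above):
    //   gemm(A, B, C, n, k, m):  C (n x m) = A (n x k) * B (k x m)
    //   transpose(A, At, n, m):  At (m x n) = transpose of A (n x m)
    void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m);
    void transpose(float* A, float* At, unsigned n, unsigned m);

    // dA (n x k) = dC (n x m) * B^T (m x k)
    // dB (k x m) = A^T (k x n) * dC (n x m)
    void gemm_backward_sketch(float* dC, float* A, float* B, float* dA, float* dB,
                              unsigned n, unsigned k, unsigned m) {
      std::vector<float> At(k * n), Bt(m * k);
      transpose(A, At.data(), n, k);     // A is n x k, so At is k x n
      transpose(B, Bt.data(), k, m);     // B is k x m, so Bt is m x k
      gemm(dC, Bt.data(), dA, n, m, k);
      gemm(At.data(), dC, dB, k, n, m);
    }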
From 7bf4f7a470052f1ce8628dde9e4cb9f6714f804c Mon Sep 17 00:00:00 2001
From: rondogency
Date: Wed, 11 Sep 2019 20:46:09 +0000
Subject: [PATCH 068/111] stateful backward and subgraph test

---
 example/lib_ops/subgraph_lib.cc  |  22 ++++-
 example/lib_ops/test_subgraph.py |  10 ++-
 include/mxnet/lib_api.h          |   4 +-
 src/c_api/c_api.cc               | 145 +++++++++++--------------------
 4 files changed, 82 insertions(+), 99 deletions(-)

diff --git a/example/lib_ops/subgraph_lib.cc b/example/lib_ops/subgraph_lib.cc
index fbb8845780c1..921832d12d02 100644
--- a/example/lib_ops/subgraph_lib.cc
+++ b/example/lib_ops/subgraph_lib.cc
@@ -89,8 +89,8 @@ MXReturnValue forward(std::map<std::string, std::string> attrs,
 
 MXReturnValue mutateInputs(std::map<std::string, std::string> attrs,
                            std::vector<int> &input_indices) {
-  input_indices.push_back(1);
-  std::cout << "the 1st input is marked as mutate input by library author" << std::endl;
+  //input_indices.push_back(1);
+  //std::cout << "the 1st input is marked as mutate input by library author" << std::endl;
   return MX_SUCCESS;
 }
 
@@ -118,6 +118,24 @@ class MyStatefulOp : public CustomStatefulOp {
     return MX_SUCCESS;
   }
 
+  int Backward(std::vector<MXTensor>& inputs,
+               std::vector<MXTensor>& outputs,
+               OpResource op_res) {
+    std::cout << "subgraph " << subgraph_sym << " backwarding" << std::endl;
+    float* input = inputs[0].getData<float>();
+    float* output1 = outputs[0].getData<float>();
+    float* output2 = outputs[1].getData<float>();
+    unsigned n = inputs[0].shape[0];
+    unsigned m = inputs[0].shape[1];
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < m; j++) {
+        output1[i * m + j] = input[i * m + j] + 58;
+        output2[i * m + j] = input[i * m + j] + 59;
+      }
+    }
+    return MX_SUCCESS;
+  }
+
   int State() { return count; }
 
   ~MyStatefulOp() {}
diff --git a/example/lib_ops/test_subgraph.py b/example/lib_ops/test_subgraph.py
index 6af60a6e0263..2d72f793fb96 100644
--- a/example/lib_ops/test_subgraph.py
+++ b/example/lib_ops/test_subgraph.py
@@ -45,7 +45,13 @@
 s = mx.sym.Variable('s')
 t = mx.sym.Variable('t')
 c = mx.sym.subgraph_op(s,t)
-exe = c.bind(ctx=mx.cpu(),args={'s':a},aux_states={'t':b})
+in_grad = [mx.nd.empty((2,2)),mx.nd.empty((2,2))]
+#exe = c.bind(ctx=mx.cpu(),args={'s':a},args_grad=in_grad,aux_states={'t':b})
+exe = c.bind(ctx=mx.cpu(),args={'s':a,'t':b},args_grad=in_grad)
 out = exe.forward()
 out = exe.forward()
-
+print(out)
+print("-----------------")
+out_grad = mx.nd.ones((2,2))
+exe.backward([out_grad])
+print(in_grad)
diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index 1b16aecd0d89..a9a9919a336c 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -105,7 +105,9 @@ class OpResource {
 #define SUBGRAPH_SYM_JSON "subgraph_sym_json"
 
 /*!
- * \brief A prototype interface class for a library author creating a stateful op
+ * \brief An abstract class for a library author creating a stateful op;
+ * a custom library should override Forward and the destructor, and may
+ * optionally implement Backward
  */
 class CustomStatefulOp {
  public:
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index efd416fb89f5..9e9c819b42cb 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -480,15 +480,8 @@ int MXLoadLib(const char *path) {
                                      dispatch_mode, DispatchMode::kFComputeEx);
   };
 
-  /*
-   * GradStruct
-   * this struct sets that the operator will use both the inputs and the outputs to compute
-   * the gradient.
The order is: [grads, inputs, outputs] - */ - struct GradStruct { - const char *op_name; - std::vector operator()(const nnvm::NodePtr& n, - const std::vector& ograds) const { + // FGradient register lambda + auto grad_reg = [=](const nnvm::NodePtr& n, const std::vector& ograds) { // copy gradients first std::vector heads(ograds.begin(), ograds.end()); // copy inputs second @@ -500,8 +493,8 @@ int MXLoadLib(const char *path) { for (uint32_t i = 0; i < n_out; ++i) { heads.emplace_back(n, i, 0); } - return mxnet::op::MakeGradNode(op_name, n, heads, n->attrs.dict); - } + std::string grad_name = "_backward_" + name_str; + return mxnet::op::MakeGradNode(grad_name.c_str(), n, heads, n->attrs.dict); }; auto resc_req = [=](const NodeAttrs& attrs) { @@ -544,11 +537,13 @@ int MXLoadLib(const char *path) { return OpStatePtr::Create(state_op); }; - auto fstateful_forward_lambda = [=](const OpStatePtr& state_ptr, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { + // stateful forward and backward + auto fstateful_lambda = [=](bool forward, + const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { // create a vector of tensors for inputs std::vector c_inputs(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { @@ -598,70 +593,30 @@ int MXLoadLib(const char *path) { CHECK(state_op_inst != nullptr) << "Error MXNet cannot load custom stateful operator'" << name_str << "'"; - CHECK(state_op_inst->Forward(c_inputs, c_outputs, op_res)) - << "Error calling ForwardStateful for custom operator '" << name_str << "'"; - + if (forward) { + CHECK(state_op_inst->Forward(c_inputs, c_outputs, op_res)) + << "Error calling ForwardStateful for custom operator '" << name_str << "'"; + } else { + CHECK(state_op_inst->Backward(c_inputs, c_outputs, op_res)) + << "Error calling BackwardStateful for custom operator '" << name_str << "'"; + } // return type void }; - auto fstateful_backward_lambda = [=](const OpStatePtr& state_ptr, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - // create a vector of tensors for inputs - std::vector c_inputs(inputs.size()); - for (size_t i = 0; i < inputs.size(); i++) { - c_inputs[i].data = inputs[i].data().dptr_; - c_inputs[i].dtype = (MXDType)inputs[i].dtype(); - for (int_least16_t j = 0; j < inputs[i].shape().ndim(); j++) { - c_inputs[i].shape.push_back(inputs[i].shape().data()[j]); - } - } - - // create a vector of tensors for outputs - std::vector c_outputs(outputs.size()); - for (size_t i = 0; i < outputs.size(); i++) { - c_outputs[i].data = outputs[i].data().dptr_; - c_outputs[i].dtype = (MXDType)outputs[i].dtype(); - for (int j = 0; j < outputs[i].shape().ndim(); j++) { - c_outputs[i].shape.push_back(outputs[i].shape().data()[j]); - } - } - - // get memory resource - const Resource &resource = ctx.requested[0]; - mshadow::Stream *cpu_stream = ctx.get_stream(); - - // create lambda that captures stream & resource objects - auto cpu_alloc = [&](int size) { - mshadow::Tensor data = - resource.get_space_typed(mshadow::Shape1(size), cpu_stream); - return data.dptr_; - }; - - // create lambda without captures so that we can cast it to function pointer - // this needs to be a lambda function so that we can do the decltype cast - typedef decltype(cpu_alloc) alloc_type; - auto cpu_malloc = [](void* _cpu_alloc, int size) { - // cast the void* argument to the type for the cpu_alloc 
lambda function - alloc_type* cpualloc = static_cast(_cpu_alloc); - void* ptr = (*cpualloc)(size); - return ptr; - }; - - OpResource op_res(cpu_malloc, &cpu_alloc); - - // retrieve op state object created from CreateOpState - CustomStatefulOpWrapper& op = state_ptr.get_state(); - CustomStatefulOp* state_op_inst = op.get_instance(); - CHECK(state_op_inst != nullptr) - << "Error MXNet cannot load custom stateful operator'" << name_str << "'"; - - CHECK(state_op_inst->Backward(c_inputs, c_outputs, op_res)) - << "Error calling BackwardStateful for custom operator '" << name_str << "'"; + auto fstateful_forward = [=](const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + fstateful_lambda(true, state_ptr, ctx, inputs, req, outputs); + }; - // return type void + auto fstateful_backward = [=](const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + fstateful_lambda(false, state_ptr, ctx, inputs, req, outputs); }; // check if operator is already registered @@ -679,7 +634,7 @@ int MXLoadLib(const char *path) { if (create_opstate_fp != nullptr) { regOp.set_attr("FCreateOpState", create_opstate); - regOp.set_attr("FStatefulComputeEx", fstateful_forward_lambda); + regOp.set_attr("FStatefulComputeEx", fstateful_forward); } else { regOp.set_attr("FComputeEx", forward_lambda); } @@ -688,21 +643,22 @@ int MXLoadLib(const char *path) { if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs); // optionally add fgradient if user specified a function - if (fgrad_fp != nullptr) { - std::string grad_name(std::string("_backward_") + name); - regOp.set_attr("FGradient", GradStruct{grad_name.c_str()}); - + if (fgrad_fp != nullptr || create_opstate_fp != nullptr) { + regOp.set_attr("FGradient", grad_reg); + std::string grad_name = "_backward_" + name_str; nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); gradOp.set_attr("TIsBackward", true); gradOp.set_attr_parser(attr_parser); gradOp.set_num_inputs(num_inouts); - gradOp.set_num_outputs(num_outputs); + gradOp.set_num_outputs(num_inputs); gradOp.set_attr("FInferStorageType", infer_storage_type); gradOp.set_attr("FResourceRequest", resc_req); - if (create_opstate_fp != nullptr) - gradOp.set_attr("FStatefulComputeEx", fstateful_backward_lambda); - else + if (create_opstate_fp != nullptr) { + gradOp.set_attr("TIsLayerOpBackward", true); + gradOp.set_attr("FStatefulComputeEx", fstateful_backward); + } else { gradOp.set_attr("FComputeEx", backward_lambda); + } } } else { // overwrite registration of existing op with custom op @@ -717,7 +673,7 @@ int MXLoadLib(const char *path) { if (create_opstate_fp != nullptr) { regOp.set_attr("FCreateOpState", create_opstate, 11); - regOp.set_attr("FStatefulComputeEx", fstateful_forward_lambda, 11); + regOp.set_attr("FStatefulComputeEx", fstateful_forward, 11); } else { regOp.set_attr("FComputeEx", forward_lambda, 11); } @@ -726,21 +682,22 @@ int MXLoadLib(const char *path) { if (mutate_fp != nullptr) regOp.set_attr("FMutateInputs", mutate_inputs, 11); // optionally add fgradient if user specified a function - if (fgrad_fp != nullptr) { - std::string grad_name(std::string("_backward_") + name); - regOp.set_attr("FGradient", GradStruct{grad_name.c_str()}, 11); - + if (fgrad_fp != nullptr || create_opstate_fp != nullptr) { + regOp.set_attr("FGradient", grad_reg, 11); + std::string grad_name = "_backward_" + name_str; nnvm::Op 
&gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); gradOp.set_attr("TIsBackward", true, 11); gradOp.set_attr_parser(attr_parser); gradOp.set_num_inputs(num_inouts); - gradOp.set_num_outputs(num_outputs); + gradOp.set_num_outputs(num_inputs); gradOp.set_attr("FInferStorageType", infer_storage_type, 11); gradOp.set_attr("FResourceRequest", resc_req, 11); - if (create_opstate_fp != nullptr) - gradOp.set_attr("FStatefulComputeEx", fstateful_backward_lambda, 11); - else + if (create_opstate_fp != nullptr) { + gradOp.set_attr("TIsLayerOpBackward", true, 11); + gradOp.set_attr("FStatefulComputeEx", fstateful_backward, 11); + } else { gradOp.set_attr("FComputeEx", backward_lambda, 11); + } } } regOp.add_argument("data", "NDArray[]", "Source inputs"); From 8aec7acd436a92ba9d5c6badb433c19f2642f8e7 Mon Sep 17 00:00:00 2001 From: rondogency Date: Thu, 12 Sep 2019 19:17:10 +0000 Subject: [PATCH 069/111] implement gemm and state gemm, refactor test files --- CMakeLists.txt | 2 +- Makefile | 2 +- example/lib_ops/Makefile | 12 +-- example/lib_ops/{mylib.cc => gemm_lib.cc} | 113 ++++++++++++++++------ example/lib_ops/libtest.cc | 78 --------------- example/lib_ops/subgraph_lib.cc | 72 ++------------ example/lib_ops/{test.py => test_gemm.py} | 36 +++++-- example/lib_ops/test_subgraph.py | 4 - example/lib_ops/test_warpctc.py | 32 ++++++ example/lib_ops/warpctc_lib.cc | 75 ++++++++++++++ include/mxnet/lib_api.h | 12 +-- 11 files changed, 238 insertions(+), 200 deletions(-) rename example/lib_ops/{mylib.cc => gemm_lib.cc} (67%) delete mode 100644 example/lib_ops/libtest.cc rename example/lib_ops/{test.py => test_gemm.py} (64%) create mode 100644 example/lib_ops/test_warpctc.py create mode 100644 example/lib_ops/warpctc_lib.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 7182bfb7bd90..80f49588cd4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -693,7 +693,7 @@ else() endif() -add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/lib_ops/mylib.cc) +add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/lib_ops/gemm_lib.cc) target_include_directories(sample_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) set(MXNET_INSTALL_TARGETS mxnet) if(UNIX) diff --git a/Makefile b/Makefile index d071ef3170e5..eb6d4a5f5d51 100644 --- a/Makefile +++ b/Makefile @@ -665,7 +665,7 @@ pylint: python3 -m pylint --rcfile=$(ROOTDIR)/ci/other/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" python/mxnet tools/caffe_converter/*.py sample_lib: - $(CXX) -shared -fPIC -std=c++11 example/lib_ops/mylib.cc -o libsample_lib.so -I include/mxnet + $(CXX) -shared -fPIC -std=c++11 example/lib_ops/gemm_lib.cc -o libsample_lib.so -I include/mxnet doc: docs diff --git a/example/lib_ops/Makefile b/example/lib_ops/Makefile index 628f09aadc4c..ce130d826a03 100644 --- a/example/lib_ops/Makefile +++ b/example/lib_ops/Makefile @@ -15,17 +15,17 @@ # specific language governing permissions and limitations # under the License. 
-all: warpctc_lib subgraph_lib
+all: warpctc_lib subgraph_lib gemm_lib
+
+gemm_lib:
+	g++ -shared -fPIC -std=gnu++0x gemm_lib.cc -o gemm_lib.so -I ../../include/mxnet
 
 warpctc_lib:
-	g++ -shared -fPIC -std=gnu++0x mylib.cc -o mylib.so -I ../../include/mxnet
+	g++ -shared -fPIC -std=gnu++0x warpctc_lib.cc -o warpctc_lib.so -I ../../include/mxnet
 
 subgraph_lib:
 	g++ -shared -fPIC -std=gnu++0x subgraph_lib.cc -o subgraph_lib.so -I ../../include/mxnet
 
-test:
-	g++ -std=c++11 -O3 -o libtest libtest.cc -ldl -I ../../include/mxnet
-
 windows:
 	cl /LD mylib.cc
 
@@ -33,4 +33,4 @@ win_test:
 	cl libtest.cc
 
 clean:
-	rm -rf mylib.so libtest
+	rm -rf *.so libtest
diff --git a/example/lib_ops/mylib.cc b/example/lib_ops/gemm_lib.cc
similarity index 67%
rename from example/lib_ops/mylib.cc
rename to example/lib_ops/gemm_lib.cc
index 404c3512a7b1..98d67ebd0437 100644
--- a/example/lib_ops/mylib.cc
+++ b/example/lib_ops/gemm_lib.cc
@@ -58,8 +58,9 @@ void transpose(float* A, float* At, unsigned n, unsigned m) {
  * outputs[0] = C
  */
 MXReturnValue forward(std::map<std::string, std::string> attrs,
-                      std::vector<MXTensor> inputs, std::vector<MXTensor> outputs,
-                      OpResource res) {
+                      std::vector<MXTensor> inputs,
+                      std::vector<MXTensor> outputs,
+                      OpResource res) {
   //validate inputs
   for(int i=0; i<inputs.size(); i++) {
     if(inputs[i].dtype != kFloat32) {
@@ -70,7 +71,7 @@ MXReturnValue forward(std::map<std::string, std::string> attrs,
       return MX_FAIL;
     }
   }
-
+
   //extract data pointers from tensors
   float* A = inputs[0].getData<float>();
   float* B = inputs[1].getData<float>();
@@ -79,7 +80,7 @@ MXReturnValue forward(std::map<std::string, std::string> attrs,
   unsigned m = inputs[1].shape[1];
 
   gemm(A, B, C, n, k, m);
-
+
   return MX_SUCCESS;
 }
 
@@ -98,8 +99,9 @@ MXReturnValue forward(std::map<std::string, std::string> attrs,
  * outputs[1] = dB
  */
 MXReturnValue backward(std::map<std::string, std::string> attrs,
-                       std::vector<MXTensor> inputs, std::vector<MXTensor> outputs,
-                       OpResource res) {
+                       std::vector<MXTensor> inputs,
+                       std::vector<MXTensor> outputs,
+                       OpResource res) {
   //validate inputs
   for(int i=0; i<inputs.size(); i++) {
     if(inputs[i].dtype != kFloat32) {
@@ -110,7 +112,7 @@ MXReturnValue backward(std::map<std::string, std::string> attrs,
       return MX_FAIL;
     }
   }
-
+
   //extract data pointers from tensors
   float* dC = inputs[0].getData<float>();
   float* A = inputs[1].getData<float>();
@@ -115,31 +117,30 @@ MXReturnValue backward(std::map<std::string, std::string> attrs,
   float* dA = outputs[0].getData<float>();
   float* dB = outputs[1].getData<float>();
   //set tensor shapes
-  unsigned n = inputs[0].shape[0];
-  unsigned k = inputs[0].shape[1];
-  unsigned m = inputs[1].shape[1];
+  unsigned n = inputs[1].shape[0];
+  unsigned k = inputs[1].shape[1];
+  unsigned m = inputs[2].shape[1];
+
+  std::cout << "n: " << n << " k: " << k << " m: " << m << std::endl;
+
+  float At[n*k], Bt[k*m];
+  transpose(A, At, n, k);
+  transpose(B, Bt, k, m);
+
+  gemm(dC, Bt, dA, n, m, k);
+  gemm(At, dC, dB, k, n, m);
 
-  //gemm(input1, input2, output, n, k, m);
-
   return MX_SUCCESS;
 }
 
-MXReturnValue parseAttrs(std::map<std::string, std::string> attrs,
-                         int* num_in, int* num_out) {
-  /*
-  if(attrs.find("myParam") == attrs.end()) {
-    std::cout << "Missing param 'myParam'" << std::endl;
-    return 0;
-  }
-  */
+MXReturnValue parseAttrs(std::map<std::string, std::string> attrs, int* num_in, int* num_out) {
   *num_in = 2;
   *num_out = 1;
-
   return MX_SUCCESS;
 }
 
 MXReturnValue inferType(std::map<std::string, std::string> attrs, std::vector<int> &intypes,
-                        std::vector<int> &outtypes) {
+                        std::vector<int> &outtypes) {
   // validate inputs
   if (intypes.size() != 2) {
     std::cout << "Expected 2 inputs to inferType" << std::endl;
@@ -159,8 +160,9 @@ MXReturnValue inferType(std::map<std::string, std::string> attrs, std::vector<int> &intypes,
   return MX_SUCCESS;
 }
 
-MXReturnValue inferShape(std::map<std::string, std::string> attrs, std::vector<std::vector<unsigned int>> &inshapes,
-                         std::vector<std::vector<unsigned int>> &outshapes) {
+MXReturnValue inferShape(std::map<std::string, std::string> attrs,
+                         std::vector<std::vector<unsigned int>> &inshapes,
+                         std::vector<std::vector<unsigned int>> &outshapes) {
   // validate inputs
   if (inshapes.size() != 2) {
     std::cout << "Expected 2 inputs to inferShape" << std::endl;
@@ -176,7 +178,7 @@ MXReturnValue inferShape(std::map<std::string, std::string> attrs,
   outshapes[0].push_back(n);
   outshapes[0].push_back(m);
-
   return MX_SUCCESS;
 }
 
-REGISTER_OP(gemm)
+REGISTER_OP(my_gemm)
 .setForward(forward)
+.setBackward(backward)
 .setParseAttrs(parseAttrs)
 .setInferType(inferType)
 .setInferShape(inferShape);
 
+class MyStatefulGemm : public CustomStatefulOp {
+ public:
+  explicit MyStatefulGemm(int count) : count(count) {}
+
+  MXReturnValue Forward(std::vector<MXTensor> inputs,
+                        std::vector<MXTensor> outputs,
+                        OpResource op_res) {
+    count++;
+    int* p = static_cast<int*>(op_res.alloc(sizeof(int)));
+    *p = count;
+    std::cout << "test op resource " << *p << std::endl;
+
+    std::map<std::string, std::string> attrs;
+    return forward(attrs, inputs, outputs, op_res);
+  }
+
+  MXReturnValue Backward(std::vector<MXTensor> inputs,
+                         std::vector<MXTensor> outputs,
+                         OpResource op_res) {
+    std::map<std::string, std::string> attrs;
+    return backward(attrs, inputs, outputs, op_res);
+  }
+
+  ~MyStatefulGemm() {}
+
+ private:
+  int count;
+};
+
+MXReturnValue createOpState(std::map<std::string, std::string> attrs,
+                            CustomStatefulOp** op_inst) {
+  *op_inst = new MyStatefulGemm(58);
+  std::cout << "create op state successful" << std::endl;
+  return MX_SUCCESS;
+}
+
+REGISTER_OP(state_gemm)
+.setParseAttrs(parseAttrs)
+.setInferType(inferType)
+.setInferShape(inferShape)
+.setMutateInputs(mutateInputs)
+.setCreateOpState(createOpState);
+
 MXReturnValue initialize(int version) {
   if (version >= 10400) {
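Note on the scratch buffers in backward() above: the variable-length arrays
(float At[n*k], Bt[k*m]) compile only as a GNU extension under -std=gnu++0x.
A sketch of routing the same scratch space through the OpResource allocator
instead (illustrative only, not part of the patch; it reuses the res.alloc call
that the stateful Forward above already demonstrates):

    // inside backward(), once n, k, m are known:
    float* At = static_cast<float*>(res.alloc(k * n * sizeof(float)));
    float* Bt = static_cast<float*>(res.alloc(m * k * sizeof(float)));
    transpose(A, At, n, k);     // At holds A^T (k x n)
    transpose(B, Bt, k, m);     // Bt holds B^T (m x k)
    gemm(dC, Bt, dA, n, m, k);  // dA (n x k) = dC * B^T
    gemm(At, dC, dB, k, n, m);  // dB (k x m) = A^T * dC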
diff --git a/example/lib_ops/libtest.cc b/example/lib_ops/libtest.cc
deleted file mode 100644
index 9fcdda55c64f..000000000000
--- a/example/lib_ops/libtest.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2019 by Contributors
- * \file libtest.cc
- * \brief This test checks if the library is implemented correctly
- * and does not involve dynamic loading of library into MXNet
- * This test is supposed to be run before test.py
- */
-
-#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
-#include <windows.h>
-#else
-#include <dlfcn.h>
-#endif
-
-#include <iostream>
-#include "lib_api.h"
-
-#define MXNET_VERSION 10500
-
-int main(void) {
-  // Get a handle to the library.
-#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
-  HINSTANCE handle;
-  handle = LoadLibrary(TEXT("mylib.dll"));
-#else
-  void *handle;
-  handle = dlopen("mylib.so", RTLD_LAZY);
-#endif
-
-  if (!handle) {
-    std::cerr << "Unable to load library" << std::endl;
-    return 1;
-  }
-
-  // get initialize function address from the library
-  initialize_t init_lib;
-#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
-  init_lib = (initialize_t) GetProcAddress(handle, MXLIB_INITIALIZE_STR);
-#else
-  init_lib = (initialize_t) dlsym(handle, MXLIB_INITIALIZE_STR);
-#endif
-
-  if (!init_lib) {
-    std::cerr << "Unable to get function 'intialize' from library" << std::endl;
-    return 1;
-  }
-
-  // Call the function.
-  (init_lib)(MXNET_VERSION);
-
-  // Deallocate memory.
-#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) - FreeLibrary(handle); -#else - dlclose(handle); -#endif - - return 0; -} diff --git a/example/lib_ops/subgraph_lib.cc b/example/lib_ops/subgraph_lib.cc index 921832d12d02..13e124c9c7d8 100644 --- a/example/lib_ops/subgraph_lib.cc +++ b/example/lib_ops/subgraph_lib.cc @@ -27,18 +27,6 @@ #include #include "lib_api.h" -void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { - unsigned i,j,kk; - for (i=0;i attrs, int* num_in, int* num_out) { *num_in = 2; @@ -70,23 +58,6 @@ MXReturnValue inferShape(std::map attrs, std::vector attrs, - std::vector inputs, std::vector outputs, - OpResource res) { - //extract data pointers from tensors - float* input1 = inputs[0].getData(); - float* input2 = inputs[1].getData(); - float* output = outputs[0].getData(); - //set tensor shapes - unsigned n = inputs[0].shape[0]; - unsigned k = inputs[0].shape[1]; - unsigned m = inputs[1].shape[1]; - - gemm(input1, input2, output, n, k, m); - - return MX_SUCCESS; -} - MXReturnValue mutateInputs(std::map attrs, std::vector &input_indices) { //input_indices.push_back(1); @@ -96,43 +67,12 @@ MXReturnValue mutateInputs(std::map attrs, class MyStatefulOp : public CustomStatefulOp { public: - MyStatefulOp(std::string sym, int count) : subgraph_sym(sym), count(count) {} - - int Forward(std::vector& inputs, - std::vector& outputs, - OpResource op_res) { - count++; - float* input1 = inputs[0].getData(); - float* output = outputs[0].getData(); - unsigned n = inputs[0].shape[0]; - unsigned m = inputs[0].shape[1]; - for (int i = 0; i < n; i++) { - for (int j = 0; j < m; j++) { - output[i * m + j] = input1[i * m + j] + count; - } - } - std::cout << "subgraph " << subgraph_sym << " forwarding" << std::endl; - int* p = static_cast(op_res.alloc(sizeof(int))); - *p = 42; - std::cout << *p << std::endl; - return MX_SUCCESS; - } + explicit MyStatefulOp(std::string sym) : subgraph_sym(sym){} - int Backward(std::vector& inputs, - std::vector& outputs, - OpResource op_res) { - std::cout << "subgraph " << subgraph_sym << " backwarding" << std::endl; - float* input = inputs[0].getData(); - float* output1 = outputs[0].getData(); - float* output2 = outputs[1].getData(); - unsigned n = inputs[0].shape[0]; - unsigned m = inputs[0].shape[1]; - for (int i = 0; i < n; i++) { - for (int j = 0; j < m; j++) { - output1[i * m + j] = input[i * m + j] + 58; - output2[i * m + j] = input[i * m + j] + 59; - } - } + MXReturnValue Forward(std::vector inputs, + std::vector outputs, + OpResource op_res) { + std::cout << "subgraph " << subgraph_sym << " forwarding" << std::endl; return MX_SUCCESS; } @@ -150,7 +90,7 @@ MXReturnValue createOpState(std::map attrs, if (attrs.count(SUBGRAPH_SYM_JSON)) { serialized_subgraph = attrs[SUBGRAPH_SYM_JSON]; } - *op_inst = new MyStatefulOp(serialized_subgraph, 0); + *op_inst = new MyStatefulOp(serialized_subgraph); std::cout << "create op state successful" << std::endl; return MX_SUCCESS; } diff --git a/example/lib_ops/test.py b/example/lib_ops/test_gemm.py similarity index 64% rename from example/lib_ops/test.py rename to example/lib_ops/test_gemm.py index d1027faa71bb..0adcad4981c2 100644 --- a/example/lib_ops/test.py +++ b/example/lib_ops/test_gemm.py @@ -28,10 +28,10 @@ #load library if (os.name=='posix'): - path = os.path.abspath('mylib.so') + path = os.path.abspath('gemm_lib.so') mx.library.load(path) elif (os.name=='nt'): - path = os.path.abspath('mylib.so') + path = os.path.abspath('gemm_lib.so') mx.library.load(path) #setup inputs 
to call test operator @@ -41,15 +41,37 @@ #print inputs print(a) print(b) -print('--------------') #compute and print output -print(mx.nd.gemm(a,b)) +print("--------start ndarray---------") +print(mx.nd.my_gemm(a,b)) +print(mx.nd.state_gemm(a,b)) -# symbol api +# symbolic compute +print("---------start symbol--------") s = mx.sym.Variable('s') t = mx.sym.Variable('t') -c = mx.sym.warpctc(s,t) -exe = c.bind(ctx=mx.cpu(),args={'s':a,'t':b}) +c = mx.sym.my_gemm(s,t) +d = mx.sym.state_gemm(s,t) + +in_grad = [mx.nd.empty((2,2)),mx.nd.empty((2,2))] +in_grad2 = [mx.nd.empty((2,2)),mx.nd.empty((2,2))] + +exe = c.bind(ctx=mx.cpu(),args={'s':a,'t':b},args_grad=in_grad) +exe2 = d.bind(ctx=mx.cpu(),args={'s':a,'t':b},args_grad=in_grad2) + out = exe.forward() print(out) + +out2 = exe2.forward() +out2 = exe2.forward() +print(out2) + +print("---------start backward--------") +out_grad = mx.nd.ones((2,2)) +exe.backward([out_grad]) +print(in_grad) + +out_grad2 = mx.nd.ones((2,2)) +exe2.backward([out_grad2]) +print(in_grad2) diff --git a/example/lib_ops/test_subgraph.py b/example/lib_ops/test_subgraph.py index 2d72f793fb96..ccfe30d8f8ef 100644 --- a/example/lib_ops/test_subgraph.py +++ b/example/lib_ops/test_subgraph.py @@ -51,7 +51,3 @@ out = exe.forward() out = exe.forward() print(out) -print("-----------------") -out_grad = mx.nd.ones((2,2)) -exe.backward([out_grad]) -print(in_grad) diff --git a/example/lib_ops/test_warpctc.py b/example/lib_ops/test_warpctc.py new file mode 100644 index 000000000000..2ade73b8f9ca --- /dev/null +++ b/example/lib_ops/test_warpctc.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=arguments-differ + +import mxnet as mx +import os + +#load library +if (os.name=='posix'): + path = os.path.abspath('warpctc_lib.so') + mx.library.load(path) +elif (os.name=='nt'): + path = os.path.abspath('warpctc_lib.so') + mx.library.load(path) diff --git a/example/lib_ops/warpctc_lib.cc b/example/lib_ops/warpctc_lib.cc new file mode 100644 index 000000000000..05e19683a272 --- /dev/null +++ b/example/lib_ops/warpctc_lib.cc @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file mylib.cc + * \brief Sample custom operator implementation + * library file + */ + +#include +#include "lib_api.h" + +MXReturnValue forward(std::map attrs, + std::vector inputs, std::vector outputs, + OpResource res) { + return MX_SUCCESS; +} + + +MXReturnValue backward(std::map attrs, + std::vector inputs, std::vector outputs, + OpResource res) { + return MX_SUCCESS; +} + +MXReturnValue parseAttrs(std::map attrs, + int* num_in, int* num_out) { + *num_in = 2; + *num_out = 1; + return MX_SUCCESS; +} + +MXReturnValue inferType(std::map attrs, std::vector &intypes, + std::vector &outtypes) { + return MX_SUCCESS; +} + +MXReturnValue inferShape(std::map attrs, std::vector> &inshapes, + std::vector> &outshapes) { + return MX_SUCCESS; +} + +REGISTER_OP(warpctc) +.setForward(forward) +.setBackward(backward) +.setParseAttrs(parseAttrs) +.setInferType(inferType) +.setInferShape(inferShape); + +MXReturnValue initialize(int version) { + if (version >= 10400) { + std::cout << "MXNet version " << version << " supported" << std::endl; + return MX_SUCCESS; + } else { + std::cout << "MXNet version " << version << " not supported" << std::endl; + return MX_FAIL; + } +} diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index a9a9919a336c..40e4cf921b28 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -111,12 +111,12 @@ class OpResource { */ class CustomStatefulOp { public: - virtual int Forward(std::vector& inputs, - std::vector& outputs, - OpResource op_res) = 0; - virtual int Backward(std::vector& inputs, - std::vector& outputs, - OpResource op_res) { + virtual MXReturnValue Forward(std::vector inputs, + std::vector outputs, + OpResource op_res) = 0; + virtual MXReturnValue Backward(std::vector inputs, + std::vector outputs, + OpResource op_res) { std::cout << "Error! Operator does not support backward" << std::endl; return MX_FAIL; } From 39e3d6b84c9059fb00340a6a1d93602f5b02da01 Mon Sep 17 00:00:00 2001 From: rondogency Date: Thu, 12 Sep 2019 20:10:59 +0000 Subject: [PATCH 070/111] add body to pure virtual destructor --- include/mxnet/lib_api.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 40e4cf921b28..35af7a3b68a0 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -123,6 +123,8 @@ class CustomStatefulOp { virtual ~CustomStatefulOp() = 0; }; +CustomStatefulOp::~CustomStatefulOp() {} + /*! 
* \brief StatefulOp wrapper class to pass to backend OpState */ From b3ba0287928fdaf0c8a4c53c6a53f5fe4fda7e1e Mon Sep 17 00:00:00 2001 From: rondogency Date: Mon, 23 Sep 2019 02:44:13 +0000 Subject: [PATCH 071/111] subgraph passing from python to custom lib --- example/lib_ops/subgraph_lib.cc | 45 +++++++++----------------------- example/lib_ops/test_subgraph.py | 36 ++++++++++++++++--------- src/c_api/c_api.cc | 8 +++--- 3 files changed, 40 insertions(+), 49 deletions(-) diff --git a/example/lib_ops/subgraph_lib.cc b/example/lib_ops/subgraph_lib.cc index 13e124c9c7d8..eb1bfe6f6c57 100644 --- a/example/lib_ops/subgraph_lib.cc +++ b/example/lib_ops/subgraph_lib.cc @@ -28,40 +28,23 @@ #include "lib_api.h" MXReturnValue parseAttrs(std::map attrs, - int* num_in, int* num_out) { - *num_in = 2; + int* num_in, int* num_out) { + *num_in = 1; *num_out = 1; return MX_SUCCESS; } -MXReturnValue inferType(std::map attrs, std::vector &intypes, - std::vector &outtypes) { +MXReturnValue inferType(std::map attrs, + std::vector &intypes, + std::vector &outtypes) { outtypes[0] = intypes[0]; return MX_SUCCESS; } -MXReturnValue inferShape(std::map attrs, std::vector> &inshapes, - std::vector> &outshapes) { - unsigned n = inshapes[0][0]; - unsigned k = inshapes[0][1]; - unsigned kk = inshapes[1][0]; - unsigned m = inshapes[1][1]; - - std::cout << "inshapes[0][0]=" << n << " inshapes[0][1]=" << k << std::endl; - std::cout << "inshapes[1][0]=" << kk << " inshapes[1][1]=" << m << std::endl; - - if (k != kk) - return MX_FAIL; - - outshapes[0].push_back(n); - outshapes[0].push_back(m); - return MX_SUCCESS; -} - -MXReturnValue mutateInputs(std::map attrs, - std::vector &input_indices) { - //input_indices.push_back(1); - //std::cout << "the 1st input is marked as mutate input by library author" << std::endl; +MXReturnValue inferShape(std::map attrs, + std::vector> &inshapes, + std::vector> &outshapes) { + outshapes[0] = inshapes[0]; return MX_SUCCESS; } @@ -76,17 +59,16 @@ class MyStatefulOp : public CustomStatefulOp { return MX_SUCCESS; } - int State() { return count; } - ~MyStatefulOp() {} - private: std::string subgraph_sym; - int count; }; MXReturnValue createOpState(std::map attrs, CustomStatefulOp** op_inst) { std::string serialized_subgraph = "[empty]"; + + // MXNet subgraph is stored as Symbol in operator node attributes subgraphs field + // custom subgraph is stored as json string in custom operator attrs map entry if (attrs.count(SUBGRAPH_SYM_JSON)) { serialized_subgraph = attrs[SUBGRAPH_SYM_JSON]; } @@ -95,11 +77,10 @@ MXReturnValue createOpState(std::map attrs, return MX_SUCCESS; } -REGISTER_OP(subgraph_op) +REGISTER_OP(_custom_subgraph_op) .setParseAttrs(parseAttrs) .setInferType(inferType) .setInferShape(inferShape) -.setMutateInputs(mutateInputs) .setCreateOpState(createOpState); MXReturnValue initialize(int version) { diff --git a/example/lib_ops/test_subgraph.py b/example/lib_ops/test_subgraph.py index ccfe30d8f8ef..fe79ae07cee9 100644 --- a/example/lib_ops/test_subgraph.py +++ b/example/lib_ops/test_subgraph.py @@ -25,6 +25,8 @@ import mxnet as mx import os +from mxnet.base import _LIB, check_call, mx_uint, c_str, c_str_array, SymbolHandle +import ctypes # load library if (os.name=='posix'): @@ -34,20 +36,28 @@ path = os.path.abspath('subgraph_lib.so') mx.library.load(path) -# setup inputs to call test operator -a = mx.nd.array([[1,2],[3,4]]) -b = mx.nd.array([[5,6],[7,8]]) +a = mx.sym.var('a') +b = mx.sym.var('b') +c = a + b +d = mx.sym.exp(c) +ret = mx.sym.log(d) -# imperative compute and print 
output -print(mx.nd.subgraph_op(a,b)) +op_names = ['exp','log'] +out = SymbolHandle() -# symbolic compute -s = mx.sym.Variable('s') -t = mx.sym.Variable('t') -c = mx.sym.subgraph_op(s,t) -in_grad = [mx.nd.empty((2,2)),mx.nd.empty((2,2))] -#exe = c.bind(ctx=mx.cpu(),args={'s':a},args_grad=in_grad,aux_states={'t':b}) -exe = c.bind(ctx=mx.cpu(),args={'s':a,'t':b},args_grad=in_grad) -out = exe.forward() +check_call(_LIB.MXBuildSubgraphByOpNames(ret.handle, + c_str('default'), + mx_uint(len(op_names)), + c_str_array(op_names), + ctypes.byref(out))) +partitioned_sym = mx.sym.Symbol(out) +json_sym = partitioned_sym.tojson() + +mystr = json_sym +mystr = json_sym.replace("_CachedOp","_custom_subgraph_op") + +mysym = mx.sym.load_json(mystr) + +exe = mysym.bind(ctx=mx.cpu(), args={'a':mx.nd.ones((3,2)), 'b':mx.nd.ones((3,2))}) out = exe.forward() print(out) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 9e9c819b42cb..01de602b1889 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -515,13 +515,13 @@ int MXLoadLib(const char *path) { } // convert subgraph symbol from node attributes to char* + std::string subgraph_json; if (!attrs.subgraphs.empty()) { nnvm::Graph g; g.outputs = attrs.subgraphs[0].get()->outputs; - const std::string serialized_subgraph = nnvm::pass::SaveJSON(g); - const std::string subgraph = SUBGRAPH_SYM_JSON; - attr_keys.push_back(subgraph.c_str()); - attr_vals.push_back(serialized_subgraph.c_str()); + subgraph_json = nnvm::pass::SaveJSON(g); + attr_keys.push_back(SUBGRAPH_SYM_JSON); + attr_vals.push_back(subgraph_json.c_str()); } // create a pointer to hold custom op state object From 1686273d880a4ce9a275d9c36f05449c6bb9792a Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Thu, 26 Sep 2019 07:26:51 +0000 Subject: [PATCH 072/111] rm lib_api c++11 dep, rm warpctc, add rm flag --- Makefile | 9 ++-- example/lib_ops/Makefile | 7 +-- example/lib_ops/test_warpctc.py | 32 -------------- example/lib_ops/warpctc_lib.cc | 75 --------------------------------- include/mxnet/lib_api.h | 27 ++++++++---- 5 files changed, 24 insertions(+), 126 deletions(-) delete mode 100644 example/lib_ops/test_warpctc.py delete mode 100644 example/lib_ops/warpctc_lib.cc diff --git a/Makefile b/Makefile index bbc5a74aa3d2..57f1accb071a 100644 --- a/Makefile +++ b/Makefile @@ -664,6 +664,7 @@ cpplint: pylint: python3 -m pylint --rcfile=$(ROOTDIR)/ci/other/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" python/mxnet tools/caffe_converter/*.py +# sample lib for dynamically loading custom operator sample_lib: $(CXX) -shared -fPIC -std=c++11 example/lib_ops/gemm_lib.cc -o libsample_lib.so -I include/mxnet @@ -716,10 +717,6 @@ rpkgtest: Rscript -e 'require(testthat);res<-test_dir("R-package/tests/testthat");if(!testthat:::all_passed(res)){stop("Test failures", call. 
= FALSE)}' Rscript -e 'res<-covr:::package_coverage("R-package");fileConn<-file(paste("r-package_coverage_",toString(runif(1)),".json"));writeLines(covr:::to_codecov(res), fileConn);close(fileConn)' - -sample_lib: - $(CXX) -shared -fPIC example/lib_api/mylib.cc -o libsample_lib.so -I include/mxnet - scalaclean: (cd $(ROOTDIR)/scala-package && mvn clean) @@ -764,7 +761,7 @@ ratcheck: build/rat/apache-rat/target/apache-rat-0.13.jar ifneq ($(EXTRA_OPERATORS),) clean: rclean cyclean $(EXTRA_PACKAGES_CLEAN) - $(RM) -r build lib bin deps *~ */*~ */*/*~ */*/*/*~ + $(RM) -r build lib bin deps libsample_lib.so *~ */*~ */*/*~ */*/*/*~ (cd scala-package && mvn clean) || true cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - @@ -775,7 +772,7 @@ clean: rclean cyclean $(EXTRA_PACKAGES_CLEAN) $(RM) -r $(patsubst %, %/*.o, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.o, $(EXTRA_OPERATORS)) else clean: rclean mkldnn_clean cyclean testclean $(EXTRA_PACKAGES_CLEAN) - $(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ + $(RM) -r build lib bin libsample_lib.so *~ */*~ */*/*~ */*/*/*~ (cd scala-package && mvn clean) || true cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - diff --git a/example/lib_ops/Makefile b/example/lib_ops/Makefile index ce130d826a03..2818bf04d7cf 100644 --- a/example/lib_ops/Makefile +++ b/example/lib_ops/Makefile @@ -15,14 +15,11 @@ # specific language governing permissions and limitations # under the License. -all: warpctc_lib subgraph_lib gemm_lib +all: subgraph_lib gemm_lib gemm_lib: g++ -shared -fPIC -std=gnu++0x gemm_lib.cc -o gemm_lib.so -I ../../include/mxnet -warpctc_lib: - g++ -shared -fPIC -std=gnu++0x warpctc_lib.cc -o warpctc_lib.so -I ../../include/mxnet - subgraph_lib: g++ -shared -fPIC -std=gnu++0x subgraph_lib.cc -o subgraph_lib.so -I ../../include/mxnet @@ -33,4 +30,4 @@ win_test: cl libtest.cc clean: - rm -rf *.so libtest + rm -rf *.so diff --git a/example/lib_ops/test_warpctc.py b/example/lib_ops/test_warpctc.py deleted file mode 100644 index 2ade73b8f9ca..000000000000 --- a/example/lib_ops/test_warpctc.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python3 - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -# pylint: disable=arguments-differ - -import mxnet as mx -import os - -#load library -if (os.name=='posix'): - path = os.path.abspath('warpctc_lib.so') - mx.library.load(path) -elif (os.name=='nt'): - path = os.path.abspath('warpctc_lib.so') - mx.library.load(path) diff --git a/example/lib_ops/warpctc_lib.cc b/example/lib_ops/warpctc_lib.cc deleted file mode 100644 index 05e19683a272..000000000000 --- a/example/lib_ops/warpctc_lib.cc +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2019 by Contributors - * \file mylib.cc - * \brief Sample custom operator implementation - * library file - */ - -#include -#include "lib_api.h" - -MXReturnValue forward(std::map attrs, - std::vector inputs, std::vector outputs, - OpResource res) { - return MX_SUCCESS; -} - - -MXReturnValue backward(std::map attrs, - std::vector inputs, std::vector outputs, - OpResource res) { - return MX_SUCCESS; -} - -MXReturnValue parseAttrs(std::map attrs, - int* num_in, int* num_out) { - *num_in = 2; - *num_out = 1; - return MX_SUCCESS; -} - -MXReturnValue inferType(std::map attrs, std::vector &intypes, - std::vector &outtypes) { - return MX_SUCCESS; -} - -MXReturnValue inferShape(std::map attrs, std::vector> &inshapes, - std::vector> &outshapes) { - return MX_SUCCESS; -} - -REGISTER_OP(warpctc) -.setForward(forward) -.setBackward(backward) -.setParseAttrs(parseAttrs) -.setInferType(inferType) -.setInferShape(inferShape); - -MXReturnValue initialize(int version) { - if (version >= 10400) { - std::cout << "MXNet version " << version << " supported" << std::endl; - return MX_SUCCESS; - } else { - std::cout << "MXNet version " << version << " not supported" << std::endl; - return MX_FAIL; - } -} diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 35af7a3b68a0..425e11f16b77 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -29,6 +29,7 @@ #define MXNET_LIB_API_H_ #include +#include #include #include #include @@ -58,10 +59,10 @@ enum MXReturnValue { * \brief External Tensor data structure */ struct MXTensor { - MXTensor() : data(nullptr) {} + MXTensor() : data(NULL) {} MXTensor(void *data, const std::vector &shape, MXDType dtype) - : data{data}, shape{shape}, dtype{dtype} {} + : data(data), shape(shape), dtype(dtype) {} /*! * \brief helper function to cast data pointer @@ -71,9 +72,19 @@ struct MXTensor { return reinterpret_cast(data); } - void *data; // not owned + // data is flatten 1D repr of tensor, elements are in continuous memory + // user can access each element using the shape of tensor + // it may also point to data allocated on gpu + void *data; + + // shape is in [2,3,4] format to represent high-dim tensor std::vector shape; + + // type can only be MXDType enum types MXDType dtype; + + // gpu flag to specify the data tensor storage location + bool is_gpu; }; /*! 
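A quick sketch of how an operator author consumes this structure (a hypothetical helper, assuming the tensor holds float32 data resident on the CPU):

float sum_elements(MXTensor &t) {
  int64_t size = 1;
  for (unsigned int i = 0; i < t.shape.size(); i++)
    size *= t.shape[i];               // total elements in the flat buffer
  float* ptr = t.getData<float>();    // cast the raw void* data pointer
  float total = 0.0f;
  for (int64_t i = 0; i < size; i++)
    total += ptr[i];
  return total;
}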
@@ -147,8 +158,8 @@ typedef MXReturnValue (*parseAttrs_t)(std::map, typedef MXReturnValue (*inferType_t)(std::map, std::vector&, std::vector&); typedef MXReturnValue (*inferShape_t)(std::map, - std::vector>&, - std::vector>&); + std::vector >&, + std::vector >&); typedef MXReturnValue (*mutateInputs_t)(std::map, std::vector&); typedef MXReturnValue (*createOpState_t)(std::map, @@ -159,9 +170,9 @@ typedef MXReturnValue (*createOpState_t)(std::map, */ class CustomOp { public: - explicit CustomOp(const char* op_name) : name(op_name), forward(nullptr), - backward(nullptr), parse_attrs(nullptr), infer_type(nullptr), infer_shape(nullptr), - mutate_inputs(nullptr), create_opstate(nullptr) {} + explicit CustomOp(const char* op_name) : name(op_name), forward(NULL), + backward(NULL), parse_attrs(NULL), infer_type(NULL), infer_shape(NULL), + mutate_inputs(NULL), create_opstate(NULL) {} ~CustomOp() {} CustomOp& setForward(fcomp_t fcomp) { forward = fcomp; From 7009ad4863837a5da1258f874e7ef661dd3b979d Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Thu, 26 Sep 2019 07:38:42 +0000 Subject: [PATCH 073/111] fix conflict --- Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 57f1accb071a..8e210fa25130 100644 --- a/Makefile +++ b/Makefile @@ -666,7 +666,7 @@ pylint: # sample lib for dynamically loading custom operator sample_lib: - $(CXX) -shared -fPIC -std=c++11 example/lib_ops/gemm_lib.cc -o libsample_lib.so -I include/mxnet + $(CXX) -shared -fPIC -std=gnu++0x example/lib_ops/gemm_lib.cc -o libsample_lib.so -I include/mxnet # Cython build cython: @@ -761,23 +761,25 @@ ratcheck: build/rat/apache-rat/target/apache-rat-0.13.jar ifneq ($(EXTRA_OPERATORS),) clean: rclean cyclean $(EXTRA_PACKAGES_CLEAN) - $(RM) -r build lib bin deps libsample_lib.so *~ */*~ */*/*~ */*/*/*~ + $(RM) -r build lib bin deps *~ */*~ */*/*~ */*/*/*~ (cd scala-package && mvn clean) || true cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - cd $(NNVM_PATH); $(MAKE) clean; cd - cd $(TVM_PATH); $(MAKE) clean; cd - cd $(AMALGAMATION_PATH); $(MAKE) clean; cd - + $(RM) libsample_lib.so $(RM) -r $(patsubst %, %/*.d, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.d, $(EXTRA_OPERATORS)) $(RM) -r $(patsubst %, %/*.o, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.o, $(EXTRA_OPERATORS)) else clean: rclean mkldnn_clean cyclean testclean $(EXTRA_PACKAGES_CLEAN) - $(RM) -r build lib bin libsample_lib.so *~ */*~ */*/*~ */*/*/*~ + $(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ (cd scala-package && mvn clean) || true cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - cd $(NNVM_PATH); $(MAKE) clean; cd - cd $(AMALGAMATION_PATH); $(MAKE) clean; cd - + $(RM) libsample_lib.so endif clean_all: clean From 4b731790529f0c701018cab97b05b6281420ffc9 Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Sat, 28 Sep 2019 00:23:34 +0000 Subject: [PATCH 074/111] subgraph json parsing utility --- example/lib_ops/subgraph_lib.cc | 20 ++++- include/mxnet/lib_api.h | 141 ++++++++++++++++++++++++++++++++ src/c_api/c_api.cc | 9 ++ 3 files changed, 167 insertions(+), 3 deletions(-) diff --git a/example/lib_ops/subgraph_lib.cc b/example/lib_ops/subgraph_lib.cc index eb1bfe6f6c57..43a82c28224c 100644 --- a/example/lib_ops/subgraph_lib.cc +++ b/example/lib_ops/subgraph_lib.cc @@ -29,8 +29,23 @@ MXReturnValue parseAttrs(std::map attrs, int* num_in, int* num_out) { - *num_in = 1; - *num_out = 1; + std::string serialized_subgraph; + if (attrs.count(SUBGRAPH_SYM_JSON)) { + serialized_subgraph = 
attrs[SUBGRAPH_SYM_JSON]; + //parse string to json + json_val val = parse_json(serialized_subgraph); + int input = 0; + for(auto &item : val.map[json_val("nodes")].list) { + if(item.map[json_val("op")].str == "null") + input++; + } + int output = val.map[json_val("heads")].list.size(); + *num_in = input; + *num_out = output; + } else { + *num_in = 1; + *num_out = 1; + } return MX_SUCCESS; } @@ -66,7 +81,6 @@ class MyStatefulOp : public CustomStatefulOp { MXReturnValue createOpState(std::map attrs, CustomStatefulOp** op_inst) { std::string serialized_subgraph = "[empty]"; - // MXNet subgraph is stored as Symbol in operator node attributes subgraphs field // custom subgraph is stored as json string in custom operator attrs map entry if (attrs.count(SUBGRAPH_SYM_JSON)) { diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 425e11f16b77..b2701950f46e 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -115,6 +115,147 @@ class OpResource { */ #define SUBGRAPH_SYM_JSON "subgraph_sym_json" +/*! + * \brief Simple Json parser to parse serialized subgraph symbol + */ +//Types of JSON objects +enum json_type {ERR,STR,NUM,LIST,MAP}; +//forward declaration of struct for JSON objects +struct json_val_t; +typedef struct json_val_t json_val; +//definition of struct for JSON objects +typedef struct json_val_t { + json_val_t() : type(ERR),num(-1),str("") {} //default constructor + json_val_t(json_type t) : type(t),num(-1),str("") {} //construct a JSON object by type + json_val_t(std::string s) : type(STR), num(-1), str(s) {} //construct a string JSON object + json_val_t(int n) : type(NUM), num(n), str(std::to_string(n)) {} //construct a number JSON object + json_val_t(json_type t, int n, std::string s) : type(t),num(n),str(s) {} //complex constructor + bool operator<(const json_val &o) const { + if(type == STR) return type == o.type && str < o.str; //for string JSON objects compare the string + if(type == NUM) return type == o.type && num < o.num; ///for number JSON objects compare the number + if(type == LIST) { //for list JSON objects, compare the size of the list, and then each object in the lists + if(list.size() != o.list.size()) return false; + for(int i=0; i< list.size(); i++) if(list[i] < o.list[i]) return false; //if we find an object that doesnt match return + return true; //all objects in lists matched + } + if(type == MAP) { //for map JSON objects, compare the size of the map, and then each key/value in the maps + if(map.size() != o.map.size()) return false; + for(auto &item : map) { + if(o.map.find(item.first) == o.map.end()) return false; //if one map is missing a key in another return + if(item.second < o.map.at(item.first)) return false; + } + return true; + } + return type < o.type; + } + std::string str; + int num; + std::vector list; + std::map map; + json_type type; +} json_val; +//forward declaration of generic parse function +json_val parse(std::string json, int *idx); +//debug function to convert a JSON object to a string +std::string json_val_string(const json_val &val) { + std::string ret; + switch(val.type) { + case ERR: + ret = "json(Error)"; + break; + case STR: + ret = "json(STR:" + val.str + ")"; + break; + case NUM: + ret = "json(INT:" + val.str + ")"; + break; + case LIST: + ret = "json(LIST:["; + for(auto &item : val.list) + ret += json_val_string(item) + ","; + ret += "])"; + break; + case MAP: + ret = "json(MAP:{"; + for(auto &item : val.map) + ret += json_val_string(item.first) + " : " + json_val_string(item.second) + ","; + ret += "})"; 
+ break; + } + return ret; +} +//debug function to print a JSON object +void print_json_val(json_val val) { + std::cout << json_val_string(val) << std::endl; +} +//parse a string JSON object +json_val parse_string(std::string json, int* idx) { + json_val ret(STR); + while(*idx < json.size()) { + if(json[*idx] == '"') {++(*idx); return ret; + } else {ret.str += json[*idx]; ++(*idx);} + } + std::cout << "Error! Unable to parse string" << std::endl; + return json_val(); +} +//parse a number JSON object +json_val parse_num(std::string json, int* idx) { + json_val ret(NUM); + while(*idx < json.size()) { + if(json[*idx] >= '0' && json[*idx] <= '9') {ret.str += json[*idx]; ++(*idx); + } else break; + } + ret.num = std::stoi(ret.str); + return ret; +} +//parse a list of JSON objects +json_val parse_list(std::string json, int* idx) { + json_val ret(LIST); + while(*idx < json.size()) { + if(json[*idx] == ']') {++(*idx); return ret; + } else { + json_val item = parse(json,idx); + if(item.type != ERR) + ret.list.push_back(item); + } + } + std::cout << "Error! Unable to parse list" << std::endl; + return json_val(); +} +//parse a map of JSON objects +json_val parse_map(std::string json, int* idx) { + json_val ret(MAP),key; + while(*idx < json.size()) { + if(json[*idx] == '}') { ++(*idx); return ret; + } else { + json_val item = parse(json,idx); + if(key.type == ERR) key = item; + else {ret.map[key]=item; key.type = ERR;} + } + } + std::cout << "Error! Unable to parse map" << std::endl; + return json_val(); +} +//generic parse function +json_val parse(std::string json, int *idx) { + json_val ret; + while(*idx < json.size()) { + if(json[*idx] == '"') {++(*idx); ret = parse_string(json,idx); + } else if(json[*idx] >= '0' && json[*idx] <= '9') {ret = parse_num(json,idx); + } else if(json[*idx] == '[') {++(*idx); ret = parse_list(json,idx); + } else if(json[*idx] == '{') {++(*idx); ret = parse_map(json,idx); + } else if(json[*idx] == ']' || json[*idx] == '}') {return ret;} + if(ret.type != ERR) return ret; + else ++(*idx); + } + return ret; +} +// Main entry point to parse a string to JSON +json_val parse_json(std::string json) { + int idx=0; + return parse(json,&idx); +} + /*! 
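For reference, the string handed to parse_json is the graph JSON produced by nnvm::pass::SaveJSON. An abbreviated, hypothetical example of such an input (real graphs carry more per-node fields):

const char* example_graph =
  "{\"nodes\": ["
  "   {\"op\": \"null\", \"name\": \"a\", \"inputs\": []},"
  "   {\"op\": \"exp\", \"name\": \"exp0\", \"inputs\": [[0, 0, 0]]}"
  "], \"heads\": [[1, 0, 0]]}";
// parse_json(example_graph) yields a MAP json_val whose "nodes" entry is a
// LIST of node MAPs; variable nodes are the ones whose "op" string is "null".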
* \brief An abstract class for library author creating stateful op * custom library should override Forward and destructor, and has an diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 01de602b1889..b9b6967104c6 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -191,6 +191,15 @@ int MXLoadLib(const char *path) { attr_keys.push_back(kv.first.c_str()); attr_vals.push_back(kv.second.c_str()); } + // convert subgraph symbol from node attributes to char* + std::string subgraph_json; + if (!attrs->subgraphs.empty()) { + nnvm::Graph g; + g.outputs = attrs->subgraphs[0].get()->outputs; + subgraph_json = nnvm::pass::SaveJSON(g); + attr_keys.push_back(SUBGRAPH_SYM_JSON); + attr_vals.push_back(subgraph_json.c_str()); + } int num_in = -1; int num_out = -1; From dca521e225734cc743aaf26a31e280a5e8df2c96 Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Sun, 29 Sep 2019 23:20:14 +0000 Subject: [PATCH 075/111] add data size and fix unsigned warnings --- example/lib_ops/subgraph_lib.cc | 5 +++++ include/mxnet/lib_api.h | 30 +++++++++++++++++++++--------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/example/lib_ops/subgraph_lib.cc b/example/lib_ops/subgraph_lib.cc index 43a82c28224c..3f3c167c54df 100644 --- a/example/lib_ops/subgraph_lib.cc +++ b/example/lib_ops/subgraph_lib.cc @@ -71,6 +71,11 @@ class MyStatefulOp : public CustomStatefulOp { std::vector outputs, OpResource op_res) { std::cout << "subgraph " << subgraph_sym << " forwarding" << std::endl; + float* in_data = inputs[0].getData(); + float* out_data = outputs[0].getData(); + for (int i = 0; i < inputs[0].getDataSize(); i++) { + out_data[i] = in_data[i]; + } return MX_SUCCESS; } diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index b2701950f46e..e02d4efec023 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -68,10 +68,21 @@ struct MXTensor { * \brief helper function to cast data pointer */ template - data_type* getData() { + inline data_type* getData() { return reinterpret_cast(data); } + /*! + * \brief helper function to get data size + */ + inline int64_t getDataSize() { + int64_t size = 1; + for (unsigned int i = 0; i < shape.size(); i++) { + size *= shape[i]; + } + return size; + } + // data is flatten 1D repr of tensor, elements are in continuous memory // user can access each element using the shape of tensor // it may also point to data allocated on gpu @@ -118,6 +129,7 @@ class OpResource { /*! 
* \brief Simple Json parser to parse serialized subgraph symbol */ + //Types of JSON objects enum json_type {ERR,STR,NUM,LIST,MAP}; //forward declaration of struct for JSON objects @@ -135,7 +147,7 @@ typedef struct json_val_t { if(type == NUM) return type == o.type && num < o.num; ///for number JSON objects compare the number if(type == LIST) { //for list JSON objects, compare the size of the list, and then each object in the lists if(list.size() != o.list.size()) return false; - for(int i=0; i< list.size(); i++) if(list[i] < o.list[i]) return false; //if we find an object that doesnt match return + for(unsigned int i=0; i< list.size(); i++) if(list[i] < o.list[i]) return false; //if we find an object that doesnt match return return true; //all objects in lists matched } if(type == MAP) { //for map JSON objects, compare the size of the map, and then each key/value in the maps @@ -155,7 +167,7 @@ typedef struct json_val_t { json_type type; } json_val; //forward declaration of generic parse function -json_val parse(std::string json, int *idx); +json_val parse(std::string json, unsigned int *idx); //debug function to convert a JSON object to a string std::string json_val_string(const json_val &val) { std::string ret; @@ -189,7 +201,7 @@ void print_json_val(json_val val) { std::cout << json_val_string(val) << std::endl; } //parse a string JSON object -json_val parse_string(std::string json, int* idx) { +json_val parse_string(std::string json, unsigned int* idx) { json_val ret(STR); while(*idx < json.size()) { if(json[*idx] == '"') {++(*idx); return ret; @@ -199,7 +211,7 @@ json_val parse_string(std::string json, int* idx) { return json_val(); } //parse a number JSON object -json_val parse_num(std::string json, int* idx) { +json_val parse_num(std::string json, unsigned int* idx) { json_val ret(NUM); while(*idx < json.size()) { if(json[*idx] >= '0' && json[*idx] <= '9') {ret.str += json[*idx]; ++(*idx); @@ -209,7 +221,7 @@ json_val parse_num(std::string json, int* idx) { return ret; } //parse a list of JSON objects -json_val parse_list(std::string json, int* idx) { +json_val parse_list(std::string json, unsigned int* idx) { json_val ret(LIST); while(*idx < json.size()) { if(json[*idx] == ']') {++(*idx); return ret; @@ -223,7 +235,7 @@ json_val parse_list(std::string json, int* idx) { return json_val(); } //parse a map of JSON objects -json_val parse_map(std::string json, int* idx) { +json_val parse_map(std::string json, unsigned int* idx) { json_val ret(MAP),key; while(*idx < json.size()) { if(json[*idx] == '}') { ++(*idx); return ret; @@ -237,7 +249,7 @@ json_val parse_map(std::string json, int* idx) { return json_val(); } //generic parse function -json_val parse(std::string json, int *idx) { +json_val parse(std::string json, unsigned int *idx) { json_val ret; while(*idx < json.size()) { if(json[*idx] == '"') {++(*idx); ret = parse_string(json,idx); @@ -252,7 +264,7 @@ json_val parse(std::string json, int *idx) { } // Main entry point to parse a string to JSON json_val parse_json(std::string json) { - int idx=0; + unsigned int idx=0; return parse(json,&idx); } From baed04ec0531851a9c16054034a2dfc640293b23 Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Mon, 30 Sep 2019 03:29:36 +0000 Subject: [PATCH 076/111] use c++ struct and fix cpplint --- example/lib_ops/gemm_lib.cc | 55 ++++--- example/lib_ops/subgraph_lib.cc | 28 ++-- include/mxnet/lib_api.h | 283 +++++++++++++++++--------------- 3 files changed, 193 insertions(+), 173 deletions(-) diff --git a/example/lib_ops/gemm_lib.cc 
b/example/lib_ops/gemm_lib.cc index 98d67ebd0437..98301cd80acb 100644 --- a/example/lib_ops/gemm_lib.cc +++ b/example/lib_ops/gemm_lib.cc @@ -31,11 +31,11 @@ * main matrix multiplication routine */ void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { - unsigned i,j,kk; - for (i=0;i attrs, +MXReturnValue forward(std::map attrs, std::vector inputs, std::vector outputs, OpResource res) { - //validate inputs - for(int i=0; i(); float* B = inputs[1].getData(); float* C = outputs[0].getData(); - //set tensor shapes + // set tensor shapes unsigned n = inputs[0].shape[0]; unsigned k = inputs[0].shape[1]; unsigned m = inputs[1].shape[1]; @@ -98,25 +98,25 @@ MXReturnValue forward(std::map attrs, * outputs[0] = dA * outputs[1] = dB */ -MXReturnValue backward(std::map attrs, +MXReturnValue backward(std::map attrs, std::vector inputs, std::vector outputs, OpResource res) { - //validate inputs - for(int i=0; i(); float* A = inputs[1].getData(); float* B = inputs[2].getData(); float* dA = outputs[0].getData(); float* dB = outputs[1].getData(); - //set tensor shapes + // set tensor shapes unsigned n = inputs[1].shape[0]; unsigned k = inputs[1].shape[1]; unsigned m = inputs[2].shape[1]; @@ -133,13 +133,14 @@ MXReturnValue backward(std::map attrs, return MX_SUCCESS; } -MXReturnValue parseAttrs(std::map attrs, int* num_in, int* num_out) { +MXReturnValue parseAttrs(std::map attrs, int* num_in, int* num_out) { *num_in = 2; *num_out = 1; return MX_SUCCESS; } -MXReturnValue inferType(std::map attrs, std::vector &intypes, +MXReturnValue inferType(std::map attrs, + std::vector &intypes, std::vector &outtypes) { // validate inputs if (intypes.size() != 2) { @@ -160,7 +161,7 @@ MXReturnValue inferType(std::map attrs, std::vector attrs, +MXReturnValue inferShape(std::map attrs, std::vector> &inshapes, std::vector> &outshapes) { // validate inputs @@ -206,10 +207,10 @@ REGISTER_OP(my_gemm) /* ------------------------------------------------------------------------- */ -MXReturnValue mutateInputs(std::map attrs, - std::vector &input_indices) { - //input_indices.push_back(1); - //std::cout << "the 1st input is marked as mutate input by library author" << std::endl; +MXReturnValue mutateInputs(std::map attrs, + std::vector &input_indices) { + // input_indices.push_back(1); + // std::cout << "the 1st input is marked as mutate input by library author" << std::endl; return MX_SUCCESS; } @@ -225,14 +226,14 @@ class MyStatefulGemm : public CustomStatefulOp { *p = count; std::cout << "test op resource " << *p << std::endl; - std::map attrs; + std::map attrs; return forward(attrs, inputs, outputs, op_res); } MXReturnValue Backward(std::vector inputs, std::vector outputs, OpResource op_res) { - std::map attrs; + std::map attrs; return backward(attrs, inputs, outputs, op_res); } @@ -242,7 +243,7 @@ class MyStatefulGemm : public CustomStatefulOp { int count; }; -MXReturnValue createOpState(std::map attrs, +MXReturnValue createOpState(std::map attrs, CustomStatefulOp** op_inst) { *op_inst = new MyStatefulGemm(58); std::cout << "create op state successful" << std::endl; diff --git a/example/lib_ops/subgraph_lib.cc b/example/lib_ops/subgraph_lib.cc index 3f3c167c54df..4e263516e0d4 100644 --- a/example/lib_ops/subgraph_lib.cc +++ b/example/lib_ops/subgraph_lib.cc @@ -27,36 +27,34 @@ #include #include "lib_api.h" -MXReturnValue parseAttrs(std::map attrs, +MXReturnValue parseAttrs(std::map attrs, int* num_in, int* num_out) { - std::string serialized_subgraph; + *num_in = 1; + *num_out = 1; if 
(attrs.count(SUBGRAPH_SYM_JSON)) { - serialized_subgraph = attrs[SUBGRAPH_SYM_JSON]; - //parse string to json - json_val val = parse_json(serialized_subgraph); + std::string serialized_subgraph = attrs[SUBGRAPH_SYM_JSON]; + Json_Parser jp; + json_val val = jp.parse_to_json(serialized_subgraph); int input = 0; - for(auto &item : val.map[json_val("nodes")].list) { - if(item.map[json_val("op")].str == "null") + for (auto &item : val.map[json_val("nodes")].list) { + if (item.map[json_val("op")].str == "null") input++; } int output = val.map[json_val("heads")].list.size(); *num_in = input; *num_out = output; - } else { - *num_in = 1; - *num_out = 1; } return MX_SUCCESS; } -MXReturnValue inferType(std::map attrs, +MXReturnValue inferType(std::map attrs, std::vector &intypes, std::vector &outtypes) { outtypes[0] = intypes[0]; return MX_SUCCESS; } -MXReturnValue inferShape(std::map attrs, +MXReturnValue inferShape(std::map attrs, std::vector> &inshapes, std::vector> &outshapes) { outshapes[0] = inshapes[0]; @@ -65,7 +63,7 @@ MXReturnValue inferShape(std::map attrs, class MyStatefulOp : public CustomStatefulOp { public: - explicit MyStatefulOp(std::string sym) : subgraph_sym(sym){} + explicit MyStatefulOp(std::string sym) : subgraph_sym(sym) {} MXReturnValue Forward(std::vector inputs, std::vector outputs, @@ -83,10 +81,10 @@ class MyStatefulOp : public CustomStatefulOp { std::string subgraph_sym; }; -MXReturnValue createOpState(std::map attrs, +MXReturnValue createOpState(std::map attrs, CustomStatefulOp** op_inst) { std::string serialized_subgraph = "[empty]"; - // MXNet subgraph is stored as Symbol in operator node attributes subgraphs field + // MXNet subgraph is stored as Symbol in operator node attrs subgraphs field // custom subgraph is stored as json string in custom operator attrs map entry if (attrs.count(SUBGRAPH_SYM_JSON)) { serialized_subgraph = attrs[SUBGRAPH_SYM_JSON]; diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index e02d4efec023..8e41e9623a14 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -34,6 +34,7 @@ #include #include #include +#include #define MX_LIBRARY_VERSION 1 @@ -122,151 +123,171 @@ class OpResource { }; /*! - * \brief Macro to help passing serialized subgraph through attribute dict + * \brief Json utility to parse serialized subgraph symbol */ +// Macro to help passing serialized subgraph through attribute dict #define SUBGRAPH_SYM_JSON "subgraph_sym_json" -/*! 
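The calling convention this enables, roughly sketched (attrs, inputs, outputs, and res stand in for values MXNet supplies at runtime):

CustomStatefulOp* op = nullptr;
createOpState(attrs, &op);           // invoked once; the library allocates MyStatefulOp
op->Forward(inputs, outputs, res);   // first inference pass
op->Forward(inputs, outputs, res);   // later passes reuse the same instance,
                                     // so the subgraph string persists as state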
- * \brief Simple Json parser to parse serialized subgraph symbol - */ +// Types of JSON objects +enum json_type {ERR, STR, NUM, LIST, MAP}; -//Types of JSON objects -enum json_type {ERR,STR,NUM,LIST,MAP}; -//forward declaration of struct for JSON objects -struct json_val_t; -typedef struct json_val_t json_val; -//definition of struct for JSON objects -typedef struct json_val_t { - json_val_t() : type(ERR),num(-1),str("") {} //default constructor - json_val_t(json_type t) : type(t),num(-1),str("") {} //construct a JSON object by type - json_val_t(std::string s) : type(STR), num(-1), str(s) {} //construct a string JSON object - json_val_t(int n) : type(NUM), num(n), str(std::to_string(n)) {} //construct a number JSON object - json_val_t(json_type t, int n, std::string s) : type(t),num(n),str(s) {} //complex constructor +// definition of struct for JSON objects +struct json_val { + json_val() : type(ERR), num(-1), str("") {} // default constructor + explicit json_val(json_type t) : type(t), num(-1), str("") {} // construct a JSON object by type + explicit json_val(std::string s) : type(STR), num(-1), str(s) {} // construct a string JSON object + explicit json_val(int n) : type(NUM), num(n), str(std::to_string(n)) {} // construct a number JSON object + json_val(json_type t, int n, std::string s) : type(t), num(n), str(s) {} // complex constructor bool operator<(const json_val &o) const { - if(type == STR) return type == o.type && str < o.str; //for string JSON objects compare the string - if(type == NUM) return type == o.type && num < o.num; ///for number JSON objects compare the number - if(type == LIST) { //for list JSON objects, compare the size of the list, and then each object in the lists - if(list.size() != o.list.size()) return false; - for(unsigned int i=0; i< list.size(); i++) if(list[i] < o.list[i]) return false; //if we find an object that doesnt match return - return true; //all objects in lists matched + if (type == STR) return type == o.type && str < o.str; // for string JSON objects compare the string + if (type == NUM) return type == o.type && num < o.num; // for number JSON objects compare the number + if (type == LIST) { // for list JSON objects, compare the size of the list, and then each object in the lists + if (list.size() != o.list.size()) return false; + for (unsigned int i=0; i< list.size(); i++) + if (list[i] < o.list[i]) + return false; // if we find an object that doesnt match return + return true; // all objects in lists matched } - if(type == MAP) { //for map JSON objects, compare the size of the map, and then each key/value in the maps - if(map.size() != o.map.size()) return false; - for(auto &item : map) { - if(o.map.find(item.first) == o.map.end()) return false; //if one map is missing a key in another return - if(item.second < o.map.at(item.first)) return false; + if (type == MAP) { // for map JSON objects, compare the size of the map, and then each key/value in the maps + if (map.size() != o.map.size()) return false; + for (auto &item : map) { + if (o.map.find(item.first) == o.map.end()) return false; // if one map is missing a key in another return + if (item.second < o.map.at(item.first)) return false; } return true; } return type < o.type; } - std::string str; + json_type type; int num; + std::string str; std::vector list; std::map map; - json_type type; -} json_val; -//forward declaration of generic parse function -json_val parse(std::string json, unsigned int *idx); -//debug function to convert a JSON object to a string -std::string 
json_val_string(const json_val &val) { - std::string ret; - switch(val.type) { - case ERR: - ret = "json(Error)"; - break; - case STR: - ret = "json(STR:" + val.str + ")"; - break; - case NUM: - ret = "json(INT:" + val.str + ")"; - break; - case LIST: - ret = "json(LIST:["; - for(auto &item : val.list) - ret += json_val_string(item) + ","; - ret += "])"; - break; - case MAP: - ret = "json(MAP:{"; - for(auto &item : val.map) - ret += json_val_string(item.first) + " : " + json_val_string(item.second) + ","; - ret += "})"; - break; - } - return ret; -} -//debug function to print a JSON object -void print_json_val(json_val val) { - std::cout << json_val_string(val) << std::endl; -} -//parse a string JSON object -json_val parse_string(std::string json, unsigned int* idx) { - json_val ret(STR); - while(*idx < json.size()) { - if(json[*idx] == '"') {++(*idx); return ret; - } else {ret.str += json[*idx]; ++(*idx);} - } - std::cout << "Error! Unable to parse string" << std::endl; - return json_val(); -} -//parse a number JSON object -json_val parse_num(std::string json, unsigned int* idx) { - json_val ret(NUM); - while(*idx < json.size()) { - if(json[*idx] >= '0' && json[*idx] <= '9') {ret.str += json[*idx]; ++(*idx); - } else break; - } - ret.num = std::stoi(ret.str); - return ret; -} -//parse a list of JSON objects -json_val parse_list(std::string json, unsigned int* idx) { - json_val ret(LIST); - while(*idx < json.size()) { - if(json[*idx] == ']') {++(*idx); return ret; - } else { - json_val item = parse(json,idx); - if(item.type != ERR) - ret.list.push_back(item); +}; + +struct Json_Parser { + json_val parse_to_json(std::string json) { + unsigned int idx = 0; + return parse(json, &idx); + } + void print_json_val(json_val val) { + std::cout << json_val_string(val) << std::endl; + } + // debug function to convert a JSON object to a string + std::string json_val_string(const json_val &val) { + std::string ret; + switch (val.type) { + case ERR: + ret = "json(Error)"; + break; + case STR: + ret = "json(STR:" + val.str + ")"; + break; + case NUM: + ret = "json(INT:" + val.str + ")"; + break; + case LIST: + ret = "json(LIST:["; + for (auto &item : val.list) + ret += json_val_string(item) + ","; + ret += "])"; + break; + case MAP: + ret = "json(MAP:{"; + for (auto &item : val.map) + ret += json_val_string(item.first) + " : " + json_val_string(item.second) + ","; + ret += "})"; + break; } - } - std::cout << "Error! Unable to parse list" << std::endl; - return json_val(); -} -//parse a map of JSON objects -json_val parse_map(std::string json, unsigned int* idx) { - json_val ret(MAP),key; - while(*idx < json.size()) { - if(json[*idx] == '}') { ++(*idx); return ret; - } else { - json_val item = parse(json,idx); - if(key.type == ERR) key = item; - else {ret.map[key]=item; key.type = ERR;} + return ret; + } + // parse a string JSON object + json_val parse_string(std::string json, unsigned int* idx) { + json_val ret(STR); + while (*idx < json.size()) { + if (json[*idx] == '"') { + ++(*idx); + return ret; + } else { + ret.str += json[*idx]; + ++(*idx); + } + } + std::cout << "Error! 
Unable to parse string" << std::endl; + return json_val(); + } + // parse a number JSON object + json_val parse_num(std::string json, unsigned int* idx) { + json_val ret(NUM); + while (*idx < json.size()) { + if (json[*idx] >= '0' && json[*idx] <= '9') { + ret.str += json[*idx]; + ++(*idx); + } else { + break; + } } + ret.num = std::stoi(ret.str); + return ret; + } + // parse a list of JSON objects + json_val parse_list(std::string json, unsigned int* idx) { + json_val ret(LIST); + while (*idx < json.size()) { + if (json[*idx] == ']') { + ++(*idx); + return ret; + } else { + json_val item = parse(json, idx); + if (item.type != ERR) + ret.list.push_back(item); + } + } + std::cout << "Error! Unable to parse list" << std::endl; + return json_val(); + } + // parse a map of JSON objects + json_val parse_map(std::string json, unsigned int* idx) { + json_val ret(MAP), key; + while (*idx < json.size()) { + if (json[*idx] == '}') { + ++(*idx); + return ret; + } else { + json_val item = parse(json, idx); + if (key.type == ERR) { + key = item; + } else { + ret.map[key] = item; + key.type = ERR; + } + } + } + std::cout << "Error! Unable to parse map" << std::endl; + return json_val(); + } + // generic parse function + json_val parse(std::string json, unsigned int *idx) { + json_val ret; + while (*idx < json.size()) { + if (json[*idx] == '"') { + ++(*idx); + ret = parse_string(json, idx); + } else if (json[*idx] >= '0' && json[*idx] <= '9') { + ret = parse_num(json, idx); + } else if (json[*idx] == '[') { + ++(*idx); + ret = parse_list(json, idx); + } else if (json[*idx] == '{') { + ++(*idx); + ret = parse_map(json, idx); + } else if (json[*idx] == ']' || json[*idx] == '}') {return ret;} + if (ret.type != ERR) return ret; + ++(*idx); + } + return ret; } - std::cout << "Error! Unable to parse map" << std::endl; - return json_val(); -} -//generic parse function -json_val parse(std::string json, unsigned int *idx) { - json_val ret; - while(*idx < json.size()) { - if(json[*idx] == '"') {++(*idx); ret = parse_string(json,idx); - } else if(json[*idx] >= '0' && json[*idx] <= '9') {ret = parse_num(json,idx); - } else if(json[*idx] == '[') {++(*idx); ret = parse_list(json,idx); - } else if(json[*idx] == '{') {++(*idx); ret = parse_map(json,idx); - } else if(json[*idx] == ']' || json[*idx] == '}') {return ret;} - if(ret.type != ERR) return ret; - else ++(*idx); - } - return ret; -} -// Main entry point to parse a string to JSON -json_val parse_json(std::string json) { - unsigned int idx=0; - return parse(json,&idx); -} +}; /*! 
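Tying the utility to its use in subgraph_lib.cc, a minimal sketch (serialized_subgraph is assumed to hold the SUBGRAPH_SYM_JSON attribute value):

Json_Parser parser;
json_val graph = parser.parse_to_json(serialized_subgraph);
int num_inputs = 0;
for (auto &node : graph.map[json_val("nodes")].list)
  if (node.map[json_val("op")].str == "null")  // variable nodes have op "null"
    num_inputs++;
int num_outputs = graph.map[json_val("heads")].list.size();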
* \brief An abstract class for library author creating stateful op @@ -323,9 +344,9 @@ typedef MXReturnValue (*createOpState_t)(std::map, */ class CustomOp { public: - explicit CustomOp(const char* op_name) : name(op_name), forward(NULL), - backward(NULL), parse_attrs(NULL), infer_type(NULL), infer_shape(NULL), - mutate_inputs(NULL), create_opstate(NULL) {} + explicit CustomOp(const char* op_name) : name(op_name), + forward(NULL), backward(NULL), parse_attrs(NULL), infer_type(NULL), + infer_shape(NULL), mutate_inputs(NULL), create_opstate(NULL) {} ~CustomOp() {} CustomOp& setForward(fcomp_t fcomp) { forward = fcomp; From aedcf91bb39bed4bf07163b29f4864b6d3b75c30 Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Mon, 30 Sep 2019 04:48:00 +0000 Subject: [PATCH 077/111] refactor op registry --- src/c_api/c_api.cc | 102 +++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 69 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index b9b6967104c6..8cc5da7f3ef4 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -152,11 +152,11 @@ int MXLoadLib(const char *path) { const char* name; // function pointers holding implementation from custom library fcomp_t fcomp_fp = nullptr; - fcomp_t fgrad_fp = nullptr; parseAttrs_t parse_fp = nullptr; inferType_t type_fp = nullptr; inferShape_t shape_fp = nullptr; // optional attributes + fcomp_t fgrad_fp = nullptr; mutateInputs_t mutate_fp = nullptr; createOpState_t create_opstate_fp = nullptr; @@ -634,84 +634,48 @@ int MXLoadLib(const char *path) { regOp.set_attr_parser(attr_parser); regOp.set_num_inputs(num_inputs); regOp.set_num_outputs(num_outputs); - if (regOpPtr == nullptr) { - // re-register op in MXNet using lambda converter functions - regOp.set_attr("FInferType", infer_type); - regOp.set_attr("FInferShape", infer_shape); - regOp.set_attr("FInferStorageType", infer_storage_type); - regOp.set_attr("FResourceRequest", resc_req); - - if (create_opstate_fp != nullptr) { - regOp.set_attr("FCreateOpState", create_opstate); - regOp.set_attr("FStatefulComputeEx", fstateful_forward); - } else { - regOp.set_attr("FComputeEx", forward_lambda); - } - - // optionally add fmutate inputs if user specified a function - if (mutate_fp != nullptr) - regOp.set_attr("FMutateInputs", mutate_inputs); - // optionally add fgradient if user specified a function - if (fgrad_fp != nullptr || create_opstate_fp != nullptr) { - regOp.set_attr("FGradient", grad_reg); - std::string grad_name = "_backward_" + name_str; - nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); - gradOp.set_attr("TIsBackward", true); - gradOp.set_attr_parser(attr_parser); - gradOp.set_num_inputs(num_inouts); - gradOp.set_num_outputs(num_inputs); - gradOp.set_attr("FInferStorageType", infer_storage_type); - gradOp.set_attr("FResourceRequest", resc_req); - if (create_opstate_fp != nullptr) { - gradOp.set_attr("TIsLayerOpBackward", true); - gradOp.set_attr("FStatefulComputeEx", fstateful_backward); - } else { - gradOp.set_attr("FComputeEx", backward_lambda); - } - } - } else { + int priority = 10; + if (regOpPtr != nullptr) { // overwrite registration of existing op with custom op regOp.arguments.clear(); // set attribute with higher plevel (11) to allow re-registering once // TODO(samskalicky): enable constant overwriting of registertion multiple times - regOp.set_attr("FInferType", infer_type, 11); - regOp.set_attr("FInferShape", infer_shape, 11); - regOp.set_attr("FComputeEx", forward_lambda, 11); - regOp.set_attr("FInferStorageType", 
infer_storage_type, 11); - regOp.set_attr("FResourceRequest", resc_req, 11); - + priority++; + } + regOp.set_attr("FInferType", infer_type, priority); + regOp.set_attr("FInferShape", infer_shape, priority); + regOp.set_attr("FInferStorageType", infer_storage_type, priority); + regOp.set_attr("FResourceRequest", resc_req, priority); + // optionally add stateful forward + if (create_opstate_fp != nullptr) { + regOp.set_attr("FCreateOpState", create_opstate, priority); + regOp.set_attr("FStatefulComputeEx", fstateful_forward, priority); + } else { + regOp.set_attr("FComputeEx", forward_lambda, priority); + } + // optionally add fmutate inputs if user specified a function + if (mutate_fp != nullptr) + regOp.set_attr("FMutateInputs", mutate_inputs, priority); + // optionally add fgradient if user specified a function + if (fgrad_fp != nullptr || create_opstate_fp != nullptr) { + regOp.set_attr("FGradient", grad_reg, priority); + std::string grad_name = "_backward_" + name_str; + nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); + gradOp.set_attr("TIsBackward", true, priority); + gradOp.set_attr_parser(attr_parser); + gradOp.set_num_inputs(num_inouts); + gradOp.set_num_outputs(num_inputs); + gradOp.set_attr("FInferStorageType", infer_storage_type, priority); + gradOp.set_attr("FResourceRequest", resc_req, priority); if (create_opstate_fp != nullptr) { - regOp.set_attr("FCreateOpState", create_opstate, 11); - regOp.set_attr("FStatefulComputeEx", fstateful_forward, 11); + gradOp.set_attr("TIsLayerOpBackward", true, priority); + gradOp.set_attr("FStatefulComputeEx", fstateful_backward, priority); } else { - regOp.set_attr("FComputeEx", forward_lambda, 11); - } - - // optionally add fmutate inputs if user specified a function - if (mutate_fp != nullptr) - regOp.set_attr("FMutateInputs", mutate_inputs, 11); - // optionally add fgradient if user specified a function - if (fgrad_fp != nullptr || create_opstate_fp != nullptr) { - regOp.set_attr("FGradient", grad_reg, 11); - std::string grad_name = "_backward_" + name_str; - nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); - gradOp.set_attr("TIsBackward", true, 11); - gradOp.set_attr_parser(attr_parser); - gradOp.set_num_inputs(num_inouts); - gradOp.set_num_outputs(num_inputs); - gradOp.set_attr("FInferStorageType", infer_storage_type, 11); - gradOp.set_attr("FResourceRequest", resc_req, 11); - if (create_opstate_fp != nullptr) { - gradOp.set_attr("TIsLayerOpBackward", true, 11); - gradOp.set_attr("FStatefulComputeEx", fstateful_backward, 11); - } else { - gradOp.set_attr("FComputeEx", backward_lambda, 11); - } + gradOp.set_attr("FComputeEx", backward_lambda, priority); } } regOp.add_argument("data", "NDArray[]", "Source inputs"); } - API_END(); } From 75102a3e2a7558c7ea72f88b7d6fadb6e5e690ab Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Mon, 30 Sep 2019 19:01:15 +0000 Subject: [PATCH 078/111] fix line length and win array of ci; condense lines --- example/lib_ops/gemm_lib.cc | 66 +++++++++----------- include/mxnet/lib_api.h | 117 ++++++++++++++---------------------- src/c_api/c_api.cc | 36 +++++------ 3 files changed, 91 insertions(+), 128 deletions(-) diff --git a/example/lib_ops/gemm_lib.cc b/example/lib_ops/gemm_lib.cc index 98301cd80acb..fda6962d7e0f 100644 --- a/example/lib_ops/gemm_lib.cc +++ b/example/lib_ops/gemm_lib.cc @@ -19,17 +19,14 @@ /*! 
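The registry behavior the plevel refactor relies on, in miniature (an illustrative snippet of the nnvm attribute semantics; builtin_impl and custom_impl are hypothetical): a later set_attr call only takes effect when its priority level is higher, which is why re-registering an existing op bumps the level from the default 10 to 11.

nnvm::Op &op = dmlc::Registry<nnvm::Op>::Get()->__REGISTER_OR_GET__("my_op");
op.set_attr<FComputeEx>("FComputeEx", builtin_impl, 10);  // default priority
op.set_attr<FComputeEx>("FComputeEx", custom_impl, 11);   // higher plevel wins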
 * Copyright (c) 2019 by Contributors
- * \file mylib.cc
- * \brief Sample custom operator implementation
- * library file
+ * \file gemm_lib.cc
+ * \brief Sample custom operator implementation library file
 */

 #include <iostream>
 #include "lib_api.h"

-/*
- * main matrix multiplication routine
- */
+// main matrix multiplication routine
 void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) {
   unsigned i, j, kk;
   for (i = 0; i < n; i++) {
@@ -53,9 +50,7 @@ void transpose(float* A, float* At, unsigned n, unsigned m) {

 /*
  * Executes C = A * B
- * inputs[0] = A
- * inputs[1] = B
- * outputs[0] = C
+ * inputs[0] = A; inputs[1] = B; outputs[0] = C
  */
 MXReturnValue forward(std::map<std::string, std::string> attrs,
                       std::vector<MXTensor> inputs,
@@ -73,7 +68,6 @@ MXReturnValue forward(std::map<std::string, std::string> attrs,
   float* A = inputs[0].getData<float>();
   float* B = inputs[1].getData<float>();
   float* C = outputs[0].getData<float>();
-
   // set tensor shapes
   unsigned n = inputs[0].shape[0];
   unsigned k = inputs[0].shape[1];
@@ -85,18 +79,15 @@ MXReturnValue forward(std::map<std::string, std::string> attrs,
 }

 /*
- * Executes dA = dC * B.T
- * Executes dB = A.T * dC
+ * Executes dA = dC * B.T; Executes dB = A.T * dC
  ***** gradient inputs
  * inputs[0] = dC
  ***** original inputs
- * inputs[1] = A
- * inputs[2] = B
+ * inputs[1] = A; inputs[2] = B
  ***** original outputs
  * inputs[3] = C
  ***** gradient outputs
- * outputs[0] = dA
- * outputs[1] = dB
+ * outputs[0] = dA; outputs[1] = dB
  */
 MXReturnValue backward(std::map<std::string, std::string> attrs,
                        std::vector<MXTensor> inputs,
@@ -123,13 +114,17 @@ MXReturnValue backward(std::map<std::string, std::string> attrs,

   std::cout << "n: " << n << " k: " << k << " m: " << m << std::endl;

-  float At[n*k], Bt[k*m];
+  float *At = new float[n*k];
+  float *Bt = new float[k*m];
+
   transpose(A, At, n, k);
   transpose(B, Bt, k, m);
   gemm(dC, Bt, dA, n, k, m);
   gemm(At, dC, dB, n, k, m);

+  delete[] At;  // buffers from new[] must be released with delete[], not free()
+  delete[] Bt;
   return MX_SUCCESS;
 }
@@ -147,8 +142,6 @@ MXReturnValue inferType(std::map<std::string, std::string> attrs,
     std::cout << "Expected 2 inputs to inferType" << std::endl;
     return MX_FAIL;
   }
-
   if (intypes[0] != intypes[1]) {
     std::cout << "Expected 2 inputs to have same data type for inferType" << std::endl;
     return MX_FAIL;
   }
@@ -169,12 +163,10 @@ MXReturnValue inferShape(std::map<std::string, std::string> attrs,
     std::cout << "Expected 2 inputs to inferShape" << std::endl;
     return MX_FAIL;
   }
-
   if (inshapes[0].size() != 2) {
     std::cout << "Expected 2D for first input to inferShape" << std::endl;
     return MX_FAIL;
   }
-
   if (inshapes[1].size() != 2) {
     std::cout << "Expected 2D for second input to inferShape" << std::endl;
     return MX_FAIL;
@@ -184,13 +176,12 @@ MXReturnValue inferShape(std::map<std::string, std::string> attrs,
   unsigned k = inshapes[0][1];
   unsigned kk = inshapes[1][0];
   unsigned m = inshapes[1][1];
+  if (k != kk)
+    return MX_FAIL;

   std::cout << "inshapes[0][0]=" << n << " inshapes[0][1]=" << k << std::endl;
   std::cout << "inshapes[1][0]=" << kk << " inshapes[1][1]=" << m << std::endl;
-  if (k != kk)
-    return MX_FAIL;
-
   outshapes[0].push_back(n);
   outshapes[0].push_back(m);
@@ -206,10 +197,7 @@ REGISTER_OP(my_gemm)
 .setInferType(inferType)
 .setInferShape(inferShape);

-
 /* ------------------------------------------------------------------------- */
-
 class MyStatefulGemm : public CustomStatefulOp {
  public:
   explicit MyStatefulGemm(int count) : count(count) {}

   MXReturnValue Forward(std::vector<MXTensor> inputs,
-                        std::vector<MXTensor> outputs,
-                        OpResource op_res) {
-    count++;
+                        std::vector<MXTensor> outputs,
OpResource op_res) { int* p = static_cast(op_res.alloc(sizeof(int))); - *p = count; - std::cout << "test op resource " << *p << std::endl; + *p = ++count; + std::cout << "Op resource testing: " << *p << std::endl; std::map attrs; return forward(attrs, inputs, outputs, op_res); } MXReturnValue Backward(std::vector inputs, - std::vector outputs, - OpResource op_res) { + std::vector outputs, + OpResource op_res) { std::map attrs; return backward(attrs, inputs, outputs, op_res); } @@ -250,6 +232,13 @@ MXReturnValue createOpState(std::map attrs, return MX_SUCCESS; } +MXReturnValue mutateInputs(std::map attrs, + std::vector &input_indices) { + // input_indices.push_back(1); + // std::cout << "the 1st input is marked as mutate input by library author" << std::endl; + return MX_SUCCESS; +} + REGISTER_OP(state_gemm) .setParseAttrs(parseAttrs) .setInferType(inferType) @@ -257,7 +246,6 @@ REGISTER_OP(state_gemm) .setMutateInputs(mutateInputs) .setCreateOpState(createOpState); - MXReturnValue initialize(int version) { if (version >= 10400) { std::cout << "MXNet version " << version << " supported" << std::endl; diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 8e41e9623a14..bd8b370ed5ee 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -65,17 +65,13 @@ struct MXTensor { MXTensor(void *data, const std::vector &shape, MXDType dtype) : data(data), shape(shape), dtype(dtype) {} - /*! - * \brief helper function to cast data pointer - */ + /*! \brief helper function to cast data pointer */ template inline data_type* getData() { return reinterpret_cast(data); } - /*! - * \brief helper function to get data size - */ + /*! \brief helper function to get data size */ inline int64_t getDataSize() { int64_t size = 1; for (unsigned int i = 0; i < shape.size(); i++) { @@ -111,12 +107,11 @@ class OpResource { public: OpResource(xpu_malloc_t xm, void* _xm) : xpu_malloc(xm), _xpu_malloc(_xm) {} - /*! - * \brief allocate memory controlled by MXNet - */ + /*! \brief allocate memory controlled by MXNet */ void* alloc(int size) { return xpu_malloc(_xpu_malloc, size); } + private: xpu_malloc_t xpu_malloc; void* _xpu_malloc; @@ -125,33 +120,42 @@ class OpResource { /*! * \brief Json utility to parse serialized subgraph symbol */ -// Macro to help passing serialized subgraph through attribute dict +/*! \brief Macro to help passing serialized subgraph through attribute dict */ #define SUBGRAPH_SYM_JSON "subgraph_sym_json" -// Types of JSON objects +/*! \brief Types of JSON objects */ enum json_type {ERR, STR, NUM, LIST, MAP}; -// definition of struct for JSON objects +/*! 
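A short sketch of the resource request pattern MyStatefulGemm uses (num_bytes and the cast are illustrative; the returned buffer is controlled by MXNet, so the library must not free it):

void* scratch = op_res.alloc(num_bytes);          // MXNet-managed workspace
float* workspace = static_cast<float*>(scratch);  // e.g. temporary accumulators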
\brief definition of JSON objects */ struct json_val { json_val() : type(ERR), num(-1), str("") {} // default constructor - explicit json_val(json_type t) : type(t), num(-1), str("") {} // construct a JSON object by type - explicit json_val(std::string s) : type(STR), num(-1), str(s) {} // construct a string JSON object - explicit json_val(int n) : type(NUM), num(n), str(std::to_string(n)) {} // construct a number JSON object - json_val(json_type t, int n, std::string s) : type(t), num(n), str(s) {} // complex constructor + // construct a JSON object by type + explicit json_val(json_type t) : type(t), num(-1), str("") {} + // construct a string JSON object + explicit json_val(std::string s) : type(STR), num(-1), str(s) {} + // construct a number JSON object + explicit json_val(int n) : type(NUM), num(n), str(std::to_string(n)) {} + // complex constructor + json_val(json_type t, int n, std::string s) : type(t), num(n), str(s) {} bool operator<(const json_val &o) const { - if (type == STR) return type == o.type && str < o.str; // for string JSON objects compare the string - if (type == NUM) return type == o.type && num < o.num; // for number JSON objects compare the number - if (type == LIST) { // for list JSON objects, compare the size of the list, and then each object in the lists + // for string JSON objects compare the string + if (type == STR) return type == o.type && str < o.str; + // for number JSON objects compare the number + if (type == NUM) return type == o.type && num < o.num; + // for list JSON objects, compare the size of list, and then each object in the list + if (type == LIST) { if (list.size() != o.list.size()) return false; for (unsigned int i=0; i< list.size(); i++) if (list[i] < o.list[i]) return false; // if we find an object that doesnt match return return true; // all objects in lists matched } - if (type == MAP) { // for map JSON objects, compare the size of the map, and then each key/value in the maps + // for map JSON objects, compare the size of map, and then each key/value in the maps + if (type == MAP) { if (map.size() != o.map.size()) return false; for (auto &item : map) { - if (o.map.find(item.first) == o.map.end()) return false; // if one map is missing a key in another return + // if one map is missing a key in another return + if (o.map.find(item.first) == o.map.end()) return false; if (item.second < o.map.at(item.first)) return false; } return true; @@ -165,6 +169,7 @@ struct json_val { std::map map; }; +/*! \brief functions used for parsing JSON */ struct Json_Parser { json_val parse_to_json(std::string json) { unsigned int idx = 0; @@ -310,9 +315,7 @@ class CustomStatefulOp { CustomStatefulOp::~CustomStatefulOp() {} -/*! - * \brief StatefulOp wrapper class to pass to backend OpState - */ +/*! \brief StatefulOp wrapper class to pass to backend OpState */ class CustomStatefulOpWrapper { public: explicit CustomStatefulOpWrapper(CustomStatefulOp* inst) : instance(inst) {} @@ -321,9 +324,7 @@ class CustomStatefulOpWrapper { CustomStatefulOp* instance; }; -/*! - * Custom Operator function templates - */ +/*! \brief Custom Operator function templates */ typedef MXReturnValue (*fcomp_t)(std::map, std::vector, std::vector, OpResource res); @@ -391,7 +392,7 @@ class CustomOp { /*! * \brief Registry class to registers things (ops, properties) - * Singleton class + * Singleton class */ template class Registry { @@ -429,29 +430,23 @@ class Registry { std::vector entries; }; -/* - * Macros to help with string concat +/*! 
+ * \brief Macros to help with string concat * Annoyingly, the concat_ and concat macros are necessary to * be able to use __COUNTER__ in an identifier name */ #define _STR_CONCAT_(__a, __b) __a ## __b #define _STR_CONCAT(__a, __b) _STR_CONCAT_(__a, __b) -/*! - * \brief convert a token to a string - */ +/*! \brief convert a token to a string */ #define STRINGIFY(x) #x #define TOSTRING(x) STRINGIFY(x) -/*! - * \brief declare a variable with custom name - */ +/*! \brief declare a variable with custom name */ #define _REGISTER_NAME_(Name) MXNet ## _CustomOp ## _ #define _REGISTER_DEF_(Name) CustomOp _REGISTER_NAME_(Name) -/*! - * \brief assign a var to a value - */ +/*! \brief assign a var to a value */ #define REGISTER_OP(Name) _STR_CONCAT(_REGISTER_DEF_(Name), __COUNTER__) = \ Registry::get()->add(TOSTRING(Name)) @@ -464,7 +459,6 @@ class Registry { * Each API has a #define string that is used to lookup the function in the library * Followed by the function declaration */ - #define MXLIB_OPREGSIZE_STR "_opRegSize" typedef int (*opRegSize_t)(void); @@ -517,9 +511,7 @@ typedef int (*initialize_t)(int); typedef int (*opVersion_t)(); extern "C" { - /*! - * \brief returns MXNet library version - */ + /*! \brief returns MXNet library version */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) int __cdecl #else @@ -529,9 +521,7 @@ extern "C" { return MX_LIBRARY_VERSION; } - /*! - * \brief returns number of ops registered in this library - */ + /*! \brief returns number of ops registered in this library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) int __cdecl #else @@ -541,9 +531,7 @@ extern "C" { return Registry::get()->size(); } - /*! - * \brief returns operator registration at specified index - */ + /*! \brief returns operator registration at specified index */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) void __cdecl #else @@ -564,10 +552,8 @@ extern "C" { *create_op = op.create_opstate; } - /*! - * \brief calls free from the external library for library allocated arrays - */ - #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) + /*! \brief calls free from the external library for library allocated arrays */ +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) void __cdecl #else void @@ -576,9 +562,7 @@ extern "C" { free(ptr); } - /*! - * \brief returns status of calling parse attributes function for operator from library - */ + /*! \brief returns status of calling parse attributes function for operator from library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) int __cdecl #else @@ -596,9 +580,7 @@ extern "C" { return parseAttrs(attrs, num_in, num_out); } - /*! - * \brief returns status of calling infer shape function for operator from library - */ + /*! \brief returns status of calling inferShape function for operator from library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) int __cdecl #else @@ -645,9 +627,7 @@ extern "C" { return retval; } - /*! - * \brief returns status of calling InferType function for operator from library - */ + /*! \brief returns status of calling inferType function for operator from library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) int __cdecl #else @@ -683,10 +663,7 @@ extern "C" { return retval; } - /*! 
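Tracing the macro definitions above, a concrete registration expands roughly as follows (the numeric suffix comes from __COUNTER__, so it depends on expansion order within the translation unit):

// REGISTER_OP(my_gemm) becomes:
CustomOp MXNet_CustomOp_0 = Registry<CustomOp>::get()->add("my_gemm");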
- * \brief returns status of calling Forward function for operator from library - */ - + /*! \brief returns status of calling Forward function for operator from library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) int __cdecl #else @@ -730,9 +707,7 @@ extern "C" { return fcomp(attrs, inputs, outputs, res); } - /*! - * \brief returns status of calling mutate inputs function for operator from library - */ + /*! \brief returns status of calling mutateInputs function for operator from library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) int __cdecl #else @@ -764,9 +739,7 @@ extern "C" { return retval; } - /*! - * \brief returns status of calling create stateful op function for operator from library - */ + /*! \brief returns status of calling createStatefulOp function for operator from library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) int __cdecl #else diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 8cc5da7f3ef4..a21eb666ea68 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -634,44 +634,46 @@ int MXLoadLib(const char *path) { regOp.set_attr_parser(attr_parser); regOp.set_num_inputs(num_inputs); regOp.set_num_outputs(num_outputs); - int priority = 10; + int plevel = 10; if (regOpPtr != nullptr) { // overwrite registration of existing op with custom op regOp.arguments.clear(); // set attribute with higher plevel (11) to allow re-registering once // TODO(samskalicky): enable constant overwriting of registertion multiple times - priority++; + plevel++; } - regOp.set_attr("FInferType", infer_type, priority); - regOp.set_attr("FInferShape", infer_shape, priority); - regOp.set_attr("FInferStorageType", infer_storage_type, priority); - regOp.set_attr("FResourceRequest", resc_req, priority); + regOp.set_attr("FInferType", infer_type, plevel); + regOp.set_attr("FInferShape", infer_shape, plevel); + regOp.set_attr("FInferStorageType", infer_storage_type, plevel); + regOp.set_attr("FResourceRequest", resc_req, plevel); // optionally add stateful forward if (create_opstate_fp != nullptr) { - regOp.set_attr("FCreateOpState", create_opstate, priority); - regOp.set_attr("FStatefulComputeEx", fstateful_forward, priority); + regOp.set_attr("FCreateOpState", create_opstate, plevel); + regOp.set_attr("FStatefulComputeEx", + fstateful_forward, plevel); } else { - regOp.set_attr("FComputeEx", forward_lambda, priority); + regOp.set_attr("FComputeEx", forward_lambda, plevel); } // optionally add fmutate inputs if user specified a function if (mutate_fp != nullptr) - regOp.set_attr("FMutateInputs", mutate_inputs, priority); + regOp.set_attr("FMutateInputs", mutate_inputs, plevel); // optionally add fgradient if user specified a function if (fgrad_fp != nullptr || create_opstate_fp != nullptr) { - regOp.set_attr("FGradient", grad_reg, priority); + regOp.set_attr("FGradient", grad_reg, plevel); std::string grad_name = "_backward_" + name_str; nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); - gradOp.set_attr("TIsBackward", true, priority); + gradOp.set_attr("TIsBackward", true, plevel); gradOp.set_attr_parser(attr_parser); gradOp.set_num_inputs(num_inouts); gradOp.set_num_outputs(num_inputs); - gradOp.set_attr("FInferStorageType", infer_storage_type, priority); - gradOp.set_attr("FResourceRequest", resc_req, priority); + gradOp.set_attr("FInferStorageType", infer_storage_type, plevel); + gradOp.set_attr("FResourceRequest", resc_req, plevel); if 
(create_opstate_fp != nullptr) { - gradOp.set_attr("TIsLayerOpBackward", true, priority); - gradOp.set_attr("FStatefulComputeEx", fstateful_backward, priority); + gradOp.set_attr("TIsLayerOpBackward", true, plevel); + gradOp.set_attr("FStatefulComputeEx", + fstateful_backward, plevel); } else { - gradOp.set_attr("FComputeEx", backward_lambda, priority); + gradOp.set_attr("FComputeEx", backward_lambda, plevel); } } regOp.add_argument("data", "NDArray[]", "Source inputs"); From c5a3ed6114e0ab4b216470c817dca7e9f1365a28 Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Tue, 1 Oct 2019 00:23:30 +0000 Subject: [PATCH 079/111] add mxnet_extension dir --- CMakeLists.txt | 2 +- Makefile | 4 ++-- example/{ => mxnet_extension}/lib_api/Makefile | 8 ++++---- .../mylib.cc => mxnet_extension/lib_api/init_lib.cc} | 2 +- example/{ => mxnet_extension}/lib_api/libtest.cc | 4 ++-- .../lib_api/test_loading.py} | 4 ++-- .../lib_custom_op}/Makefile | 10 ++-------- .../lib_custom_op}/gemm_lib.cc | 0 .../lib_custom_op}/subgraph_lib.cc | 0 .../lib_custom_op}/test_gemm.py | 0 .../lib_custom_op}/test_subgraph.py | 0 11 files changed, 14 insertions(+), 20 deletions(-) rename example/{ => mxnet_extension}/lib_api/Makefile (80%) rename example/{lib_api/mylib.cc => mxnet_extension/lib_api/init_lib.cc} (98%) rename example/{ => mxnet_extension}/lib_api/libtest.cc (95%) rename example/{lib_api/test.py => mxnet_extension/lib_api/test_loading.py} (92%) rename example/{lib_ops => mxnet_extension/lib_custom_op}/Makefile (91%) rename example/{lib_ops => mxnet_extension/lib_custom_op}/gemm_lib.cc (100%) rename example/{lib_ops => mxnet_extension/lib_custom_op}/subgraph_lib.cc (100%) rename example/{lib_ops => mxnet_extension/lib_custom_op}/test_gemm.py (100%) rename example/{lib_ops => mxnet_extension/lib_custom_op}/test_subgraph.py (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 80f49588cd4d..f1c39b966f14 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -693,7 +693,7 @@ else() endif() -add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/lib_ops/gemm_lib.cc) +add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/mxnet_extension/lib_custom_op/gemm_lib.cc) target_include_directories(sample_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) set(MXNET_INSTALL_TARGETS mxnet) if(UNIX) diff --git a/Makefile b/Makefile index 2b20477e76f8..12feee183d54 100644 --- a/Makefile +++ b/Makefile @@ -668,9 +668,9 @@ cpplint: pylint: python3 -m pylint --rcfile=$(ROOTDIR)/ci/other/pylintrc --ignore-patterns=".*\.so$$,.*\.dll$$,.*\.dylib$$" python/mxnet tools/caffe_converter/*.py -# sample lib for dynamically loading custom operator +# sample lib for MXNet extension dynamically loading custom operator sample_lib: - $(CXX) -shared -fPIC -std=gnu++0x example/lib_ops/gemm_lib.cc -o libsample_lib.so -I include/mxnet + $(CXX) -shared -fPIC -std=gnu++0x example/mxnet_extension/lib_custom_op/gemm_lib.cc -o libsample_lib.so -I include/mxnet # Cython build cython: diff --git a/example/lib_api/Makefile b/example/mxnet_extension/lib_api/Makefile similarity index 80% rename from example/lib_api/Makefile rename to example/mxnet_extension/lib_api/Makefile index a811f2250b3e..30bea5a622f5 100644 --- a/example/lib_api/Makefile +++ b/example/mxnet_extension/lib_api/Makefile @@ -16,16 +16,16 @@ # under the License. 
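# The library target below is built with -shared -fPIC so that MXNet can
# dlopen() the resulting shared object at runtime; the `test` target builds a
# small standalone loader (linked with -ldl) that opens the library directly,
# without going through MXNet. The rename to init_lib.* and the extra ../ in
# the include path reflect the move into example/mxnet_extension/.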
all: - g++ -std=c++11 -shared -fPIC mylib.cc -o mylib.so -I ../../include/mxnet + g++ -std=c++11 -shared -fPIC init_lib.cc -o init_lib.so -I ../../../include/mxnet test: - g++ -std=c++11 -O3 -o libtest libtest.cc -ldl -I ../../include/mxnet + g++ -std=c++11 -O3 -o libtest libtest.cc -ldl -I ../../../include/mxnet windows: - cl /LD mylib.cc + cl /LD init_lib.cc win_test: cl libtest.cc clean: - rm -rf mylib.so libtest + rm -rf init_lib.so libtest diff --git a/example/lib_api/mylib.cc b/example/mxnet_extension/lib_api/init_lib.cc similarity index 98% rename from example/lib_api/mylib.cc rename to example/mxnet_extension/lib_api/init_lib.cc index 048642332f16..6a040ffa2ecb 100644 --- a/example/lib_api/mylib.cc +++ b/example/mxnet_extension/lib_api/init_lib.cc @@ -19,7 +19,7 @@ /*! * Copyright (c) 2015 by Contributors - * \file mylib.cc + * \file init_lib.cc * \brief Sample library file */ diff --git a/example/lib_api/libtest.cc b/example/mxnet_extension/lib_api/libtest.cc similarity index 95% rename from example/lib_api/libtest.cc rename to example/mxnet_extension/lib_api/libtest.cc index 8bdf36c05d37..c214513b1785 100644 --- a/example/lib_api/libtest.cc +++ b/example/mxnet_extension/lib_api/libtest.cc @@ -40,10 +40,10 @@ int main(void) { // Get a handle to the library. #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) HINSTANCE handle; - handle = LoadLibrary(TEXT("mylib.dll")); + handle = LoadLibrary(TEXT("init_lib.dll")); #else void *handle; - handle = dlopen("mylib.so", RTLD_LAZY); + handle = dlopen("init_lib.so", RTLD_LAZY); #endif if (!handle) { diff --git a/example/lib_api/test.py b/example/mxnet_extension/lib_api/test_loading.py similarity index 92% rename from example/lib_api/test.py rename to example/mxnet_extension/lib_api/test_loading.py index 840924c1317c..2325c50e827f 100644 --- a/example/lib_api/test.py +++ b/example/mxnet_extension/lib_api/test_loading.py @@ -26,8 +26,8 @@ import os if (os.name=='posix'): - path = os.path.abspath('mylib.so') + path = os.path.abspath('init_lib.so') mx.library.load(path) elif (os.name=='nt'): - path = os.path.abspath('mylib.so') + path = os.path.abspath('init_lib.so') mx.library.load(path) diff --git a/example/lib_ops/Makefile b/example/mxnet_extension/lib_custom_op/Makefile similarity index 91% rename from example/lib_ops/Makefile rename to example/mxnet_extension/lib_custom_op/Makefile index 2818bf04d7cf..aa8c8954783f 100644 --- a/example/lib_ops/Makefile +++ b/example/mxnet_extension/lib_custom_op/Makefile @@ -18,16 +18,10 @@ all: subgraph_lib gemm_lib gemm_lib: - g++ -shared -fPIC -std=gnu++0x gemm_lib.cc -o gemm_lib.so -I ../../include/mxnet + g++ -shared -fPIC -std=gnu++0x gemm_lib.cc -o gemm_lib.so -I ../../../include/mxnet subgraph_lib: - g++ -shared -fPIC -std=gnu++0x subgraph_lib.cc -o subgraph_lib.so -I ../../include/mxnet - -windows: - cl /LD mylib.cc - -win_test: - cl libtest.cc + g++ -shared -fPIC -std=gnu++0x subgraph_lib.cc -o subgraph_lib.so -I ../../../include/mxnet clean: rm -rf *.so diff --git a/example/lib_ops/gemm_lib.cc b/example/mxnet_extension/lib_custom_op/gemm_lib.cc similarity index 100% rename from example/lib_ops/gemm_lib.cc rename to example/mxnet_extension/lib_custom_op/gemm_lib.cc diff --git a/example/lib_ops/subgraph_lib.cc b/example/mxnet_extension/lib_custom_op/subgraph_lib.cc similarity index 100% rename from example/lib_ops/subgraph_lib.cc rename to example/mxnet_extension/lib_custom_op/subgraph_lib.cc diff --git a/example/lib_ops/test_gemm.py b/example/mxnet_extension/lib_custom_op/test_gemm.py 
similarity index 100% rename from example/lib_ops/test_gemm.py rename to example/mxnet_extension/lib_custom_op/test_gemm.py diff --git a/example/lib_ops/test_subgraph.py b/example/mxnet_extension/lib_custom_op/test_subgraph.py similarity index 100% rename from example/lib_ops/test_subgraph.py rename to example/mxnet_extension/lib_custom_op/test_subgraph.py From 44affc7ed8003ebb7439039817ed76636f1f7453 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 1 Oct 2019 18:46:20 +0000 Subject: [PATCH 080/111] fixed extension to be dll for windows --- example/lib_api/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/lib_api/test.py b/example/lib_api/test.py index 840924c1317c..9e5e4ca05c67 100644 --- a/example/lib_api/test.py +++ b/example/lib_api/test.py @@ -29,5 +29,5 @@ path = os.path.abspath('mylib.so') mx.library.load(path) elif (os.name=='nt'): - path = os.path.abspath('mylib.so') + path = os.path.abspath('mylib.dll') mx.library.load(path) From ef1d4cff87883480336813a7195fdcc8db8ed39e Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 1 Oct 2019 19:08:47 +0000 Subject: [PATCH 081/111] updated examples to use the same format as the example in the top-level Makefile: "lib.so" --- example/lib_api/Makefile | 4 ++-- example/lib_api/test.py | 4 ++-- example/lib_ops/Makefile | 4 ++-- example/lib_ops/test_gemm.py | 4 ++-- example/lib_ops/test_subgraph.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/example/lib_api/Makefile b/example/lib_api/Makefile index a811f2250b3e..9a8974692b2c 100644 --- a/example/lib_api/Makefile +++ b/example/lib_api/Makefile @@ -16,7 +16,7 @@ # under the License. all: - g++ -std=c++11 -shared -fPIC mylib.cc -o mylib.so -I ../../include/mxnet + g++ -std=c++11 -shared -fPIC mylib.cc -o libmylib.so -I ../../include/mxnet test: g++ -std=c++11 -O3 -o libtest libtest.cc -ldl -I ../../include/mxnet @@ -28,4 +28,4 @@ win_test: cl libtest.cc clean: - rm -rf mylib.so libtest + rm -rf libmylib.so libtest diff --git a/example/lib_api/test.py b/example/lib_api/test.py index 9e5e4ca05c67..084dc348de15 100644 --- a/example/lib_api/test.py +++ b/example/lib_api/test.py @@ -26,8 +26,8 @@ import os if (os.name=='posix'): - path = os.path.abspath('mylib.so') + path = os.path.abspath('libmylib.so') mx.library.load(path) elif (os.name=='nt'): - path = os.path.abspath('mylib.dll') + path = os.path.abspath('libmylib.dll') mx.library.load(path) diff --git a/example/lib_ops/Makefile b/example/lib_ops/Makefile index 2818bf04d7cf..d78b78283bc4 100644 --- a/example/lib_ops/Makefile +++ b/example/lib_ops/Makefile @@ -18,10 +18,10 @@ all: subgraph_lib gemm_lib gemm_lib: - g++ -shared -fPIC -std=gnu++0x gemm_lib.cc -o gemm_lib.so -I ../../include/mxnet + g++ -shared -fPIC -std=gnu++0x gemm_lib.cc -o libgemm_lib.so -I ../../include/mxnet subgraph_lib: - g++ -shared -fPIC -std=gnu++0x subgraph_lib.cc -o subgraph_lib.so -I ../../include/mxnet + g++ -shared -fPIC -std=gnu++0x subgraph_lib.cc -o libsubgraph_lib.so -I ../../include/mxnet windows: cl /LD mylib.cc diff --git a/example/lib_ops/test_gemm.py b/example/lib_ops/test_gemm.py index 0adcad4981c2..49282746b740 100644 --- a/example/lib_ops/test_gemm.py +++ b/example/lib_ops/test_gemm.py @@ -28,10 +28,10 @@ #load library if (os.name=='posix'): - path = os.path.abspath('gemm_lib.so') + path = os.path.abspath('libgemm_lib.so') mx.library.load(path) elif (os.name=='nt'): - path = os.path.abspath('gemm_lib.so') + path = os.path.abspath('libgemm_lib.dll') mx.library.load(path) #setup inputs to 
call test operator diff --git a/example/lib_ops/test_subgraph.py b/example/lib_ops/test_subgraph.py index fe79ae07cee9..5d901986d95d 100644 --- a/example/lib_ops/test_subgraph.py +++ b/example/lib_ops/test_subgraph.py @@ -30,10 +30,10 @@ # load library if (os.name=='posix'): - path = os.path.abspath('subgraph_lib.so') + path = os.path.abspath('libsubgraph_lib.so') mx.library.load(path) elif (os.name=='nt'): - path = os.path.abspath('subgraph_lib.so') + path = os.path.abspath('libsubgraph_lib.dll') mx.library.load(path) a = mx.sym.var('a') From 24d8cc399d7b31a34bcd2b591581f24de4ba3098 Mon Sep 17 00:00:00 2001 From: Sam Skalicky Date: Tue, 1 Oct 2019 19:53:32 +0000 Subject: [PATCH 082/111] removed destructor for CustomStatefulOp --- include/mxnet/lib_api.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index bd8b370ed5ee..c57c7e9f109e 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -310,11 +310,8 @@ class CustomStatefulOp { std::cout << "Error! Operator does not support backward" << std::endl; return MX_FAIL; } - virtual ~CustomStatefulOp() = 0; }; -CustomStatefulOp::~CustomStatefulOp() {} - /*! \brief StatefulOp wrapper class to pass to backend OpState */ class CustomStatefulOpWrapper { public: From 279a9896bcf7a1ec3f58efa7c3ee08815eb475bd Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Wed, 2 Oct 2019 00:19:32 +0000 Subject: [PATCH 083/111] fix error in gemm test and clear up subgraph test --- .../mxnet_extension/lib_custom_op/gemm_lib.cc | 47 +++++++----------- .../lib_custom_op/subgraph_lib.cc | 12 ++--- .../lib_custom_op/test_gemm.py | 49 +++++++++---------- .../lib_custom_op/test_subgraph.py | 14 ++---- 4 files changed, 49 insertions(+), 73 deletions(-) diff --git a/example/mxnet_extension/lib_custom_op/gemm_lib.cc b/example/mxnet_extension/lib_custom_op/gemm_lib.cc index fda6962d7e0f..0f65dc517162 100644 --- a/example/mxnet_extension/lib_custom_op/gemm_lib.cc +++ b/example/mxnet_extension/lib_custom_op/gemm_lib.cc @@ -20,7 +20,7 @@ /*! 
 * Copyright (c) 2019 by Contributors
 * \file gemm_lib.cc
- * \brief Sample custom operator implementation library file
+ * \brief Sample 2D gemm custom operator implementation library file
  */

 #include <iostream>
@@ -41,9 +41,9 @@ void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) {

 void transpose(float* A, float* At, unsigned n, unsigned m) {
   unsigned i, j;
-  for (i=0; i < n; i++) {
-    for (j=0; j < m; j++) {
-      At[i*n+j] = A[j*m+i];
+  for (i = 0; i < n; i++) {
+    for (j = 0; j < m; j++) {
+      At[i*m+j] = A[j*n+i];
     }
   }
 }
@@ -74,7 +74,6 @@ MXReturnValue forward(std::map<std::string, std::string> attrs,
   unsigned m = inputs[1].shape[1];

   gemm(A, B, C, n, k, m);
-
   return MX_SUCCESS;
 }

@@ -112,19 +111,16 @@ MXReturnValue backward(std::map<std::string, std::string> attrs,
   unsigned k = inputs[1].shape[1];
   unsigned m = inputs[2].shape[1];

-  std::cout << "n: " << n << " k: " << k << " m: " << m << std::endl;
-
-  float *At = new float[n*k];
-  float *Bt = new float[k*m];
-
-  transpose(A, At, n, k);
-  transpose(B, Bt, k, m);
+  float *At = new float[k*n];
+  float *Bt = new float[m*k];

-  gemm(dC, Bt, dA, n, k, m);
-  gemm(At, dC, dB, n, k, m);
+  transpose(A, At, k, n);
+  transpose(B, Bt, m, k);
+  gemm(dC, Bt, dA, n, m, k);
+  gemm(At, dC, dB, k, n, m);

-  free(At);
-  free(Bt);
+  delete[] At;
+  delete[] Bt;

   return MX_SUCCESS;
 }
@@ -148,10 +144,6 @@ MXReturnValue inferType(std::map<std::string, std::string> attrs,
   }

   outtypes[0] = intypes[0];
-
-  std::cout << "intypes[0]=" << intypes[0] << " outtypes[0]=" << outtypes[0] << std::endl;
-  std::cout << "intypes=" << intypes.size() << " outtypes=" << outtypes.size() << std::endl;
-
   return MX_SUCCESS;
 }
@@ -176,15 +168,13 @@ MXReturnValue inferShape(std::map<std::string, std::string> attrs,
   unsigned k = inshapes[0][1];
   unsigned kk = inshapes[1][0];
   unsigned m = inshapes[1][1];
-  if (k != kk)
+  if (k != kk) {
+    std::cout << "Expected first input axis 1 to equal second input axis 0" << std::endl;
     return MX_FAIL;
-
-  std::cout << "inshapes[0][0]=" << n << " inshapes[0][1]=" << k << std::endl;
-  std::cout << "inshapes[1][0]=" << kk << " inshapes[1][1]=" << m << std::endl;
+  }

   outshapes[0].push_back(n);
   outshapes[0].push_back(m);
-
   return MX_SUCCESS;
 }
@@ -206,7 +196,7 @@ class MyStatefulGemm : public CustomStatefulOp {
                         OpResource op_res) {
     int* p = static_cast<int*>(op_res.alloc(sizeof(int)));
     *p = ++count;
-    std::cout << "Op resource testing: " << *p << std::endl;
+    std::cout << "Info: op resource testing: " << *p << std::endl;
     std::map<std::string, std::string> attrs;
     return forward(attrs, inputs, outputs, op_res);
@@ -228,14 +218,13 @@ class MyStatefulGemm : public CustomStatefulOp {

 MXReturnValue createOpState(std::map<std::string, std::string> attrs,
                             CustomStatefulOp** op_inst) {
   *op_inst = new MyStatefulGemm(58);
-  std::cout << "create op state successful" << std::endl;
+  std::cout << "Info: create op state successful" << std::endl;
   return MX_SUCCESS;
 }

 MXReturnValue mutateInputs(std::map<std::string, std::string> attrs,
                            std::vector<int> &input_indices) {
-  // input_indices.push_back(1);
-  // std::cout << "the 1st input is marked as mutate input by library author" << std::endl;
+  // input_indices.push_back(1);  // mark mutate input
   return MX_SUCCESS;
 }
diff --git a/example/mxnet_extension/lib_custom_op/subgraph_lib.cc b/example/mxnet_extension/lib_custom_op/subgraph_lib.cc
index 4e263516e0d4..6a9331043565 100644
--- a/example/mxnet_extension/lib_custom_op/subgraph_lib.cc
+++ b/example/mxnet_extension/lib_custom_op/subgraph_lib.cc
@@ -20,8 +20,7 @@
 /*!
* Copyright (c) 2019 by Contributors * \file subgraph_lib.cc - * \brief subgraph operator implementation - * library file + * \brief subgraph operator implementation library file */ #include @@ -32,9 +31,9 @@ MXReturnValue parseAttrs(std::map attrs, *num_in = 1; *num_out = 1; if (attrs.count(SUBGRAPH_SYM_JSON)) { - std::string serialized_subgraph = attrs[SUBGRAPH_SYM_JSON]; + // example of subgraph json parsing Json_Parser jp; - json_val val = jp.parse_to_json(serialized_subgraph); + json_val val = jp.parse_to_json(attrs[SUBGRAPH_SYM_JSON]); int input = 0; for (auto &item : val.map[json_val("nodes")].list) { if (item.map[json_val("op")].str == "null") @@ -68,7 +67,7 @@ class MyStatefulOp : public CustomStatefulOp { MXReturnValue Forward(std::vector inputs, std::vector outputs, OpResource op_res) { - std::cout << "subgraph " << subgraph_sym << " forwarding" << std::endl; + std::cout << "Info: subgraph symbol is: " << subgraph_sym << std::endl; float* in_data = inputs[0].getData(); float* out_data = outputs[0].getData(); for (int i = 0; i < inputs[0].getDataSize(); i++) { @@ -87,10 +86,11 @@ MXReturnValue createOpState(std::map attrs, // MXNet subgraph is stored as Symbol in operator node attrs subgraphs field // custom subgraph is stored as json string in custom operator attrs map entry if (attrs.count(SUBGRAPH_SYM_JSON)) { + // user can now parse json and run other custom ops inside subgraph serialized_subgraph = attrs[SUBGRAPH_SYM_JSON]; } *op_inst = new MyStatefulOp(serialized_subgraph); - std::cout << "create op state successful" << std::endl; + std::cout << "Info: create op state successful" << std::endl; return MX_SUCCESS; } diff --git a/example/mxnet_extension/lib_custom_op/test_gemm.py b/example/mxnet_extension/lib_custom_op/test_gemm.py index 0adcad4981c2..99d1541c00d9 100644 --- a/example/mxnet_extension/lib_custom_op/test_gemm.py +++ b/example/mxnet_extension/lib_custom_op/test_gemm.py @@ -20,42 +20,30 @@ # coding: utf-8 # pylint: disable=arguments-differ -# This test checks if dynamic loading of library into MXNet is successful -# and checks the end of end computation of custom operator +# This test checks dynamic loading of custom library into MXNet +# and checks end to end compute of a simple 2D gemm custom op import mxnet as mx import os -#load library -if (os.name=='posix'): - path = os.path.abspath('gemm_lib.so') - mx.library.load(path) -elif (os.name=='nt'): - path = os.path.abspath('gemm_lib.so') - mx.library.load(path) +path = os.path.abspath('gemm_lib.so') +mx.library.load(path) -#setup inputs to call test operator -a = mx.nd.array([[1,2],[3,4]]) -b = mx.nd.array([[5,6],[7,8]]) +a = mx.nd.array([[1,2,3],[4,5,6]]) +b = mx.nd.array([[7],[8],[9]]) -#print inputs -print(a) -print(b) - -#compute and print output -print("--------start ndarray---------") +print("--------start ndarray compute---------") print(mx.nd.my_gemm(a,b)) print(mx.nd.state_gemm(a,b)) -# symbolic compute -print("---------start symbol--------") +print("--------start symbolic compute--------") s = mx.sym.Variable('s') t = mx.sym.Variable('t') c = mx.sym.my_gemm(s,t) d = mx.sym.state_gemm(s,t) -in_grad = [mx.nd.empty((2,2)),mx.nd.empty((2,2))] -in_grad2 = [mx.nd.empty((2,2)),mx.nd.empty((2,2))] +in_grad = [mx.nd.empty((2,3)),mx.nd.empty((3,1))] +in_grad2 = [mx.nd.empty((2,3)),mx.nd.empty((3,1))] exe = c.bind(ctx=mx.cpu(),args={'s':a,'t':b},args_grad=in_grad) exe2 = d.bind(ctx=mx.cpu(),args={'s':a,'t':b},args_grad=in_grad2) @@ -67,11 +55,18 @@ out2 = exe2.forward() print(out2) -print("---------start 
backward--------") -out_grad = mx.nd.ones((2,2)) +# baseline forward +e = mx.sym.linalg.gemm2(s,t) +in_grad3 = [mx.nd.empty((2,3)),mx.nd.empty((3,1))] +exe3 = e.bind(ctx=mx.cpu(),args={'s':a,'t':b},args_grad=in_grad3) +out3 = exe3.forward() +print(out3) + +print("--------start backward compute--------") +out_grad = mx.nd.ones((2,1)) exe.backward([out_grad]) print(in_grad) - -out_grad2 = mx.nd.ones((2,2)) -exe2.backward([out_grad2]) +exe2.backward([out_grad]) print(in_grad2) +exe3.backward([out_grad]) +print(in_grad3) diff --git a/example/mxnet_extension/lib_custom_op/test_subgraph.py b/example/mxnet_extension/lib_custom_op/test_subgraph.py index fe79ae07cee9..e7d2e2227be8 100644 --- a/example/mxnet_extension/lib_custom_op/test_subgraph.py +++ b/example/mxnet_extension/lib_custom_op/test_subgraph.py @@ -24,17 +24,11 @@ # and checks the end of end computation of custom operator import mxnet as mx -import os +import os, ctypes from mxnet.base import _LIB, check_call, mx_uint, c_str, c_str_array, SymbolHandle -import ctypes -# load library -if (os.name=='posix'): - path = os.path.abspath('subgraph_lib.so') - mx.library.load(path) -elif (os.name=='nt'): - path = os.path.abspath('subgraph_lib.so') - mx.library.load(path) +path = os.path.abspath('subgraph_lib.so') +mx.library.load(path) a = mx.sym.var('a') b = mx.sym.var('b') @@ -53,9 +47,7 @@ partitioned_sym = mx.sym.Symbol(out) json_sym = partitioned_sym.tojson() -mystr = json_sym mystr = json_sym.replace("_CachedOp","_custom_subgraph_op") - mysym = mx.sym.load_json(mystr) exe = mysym.bind(ctx=mx.cpu(), args={'a':mx.nd.ones((3,2)), 'b':mx.nd.ones((3,2))}) From 75b1169dad4d5f9e2453891a98e5f2a1cf05a6bd Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Wed, 2 Oct 2019 00:48:33 +0000 Subject: [PATCH 084/111] lib path fix --- example/mxnet_extension/lib_api/libtest.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/mxnet_extension/lib_api/libtest.cc b/example/mxnet_extension/lib_api/libtest.cc index c214513b1785..0b2c6f64789c 100644 --- a/example/mxnet_extension/lib_api/libtest.cc +++ b/example/mxnet_extension/lib_api/libtest.cc @@ -40,10 +40,10 @@ int main(void) { // Get a handle to the library. 
 #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
   HINSTANCE handle;
-  handle = LoadLibrary(TEXT("init_lib.dll"));
+  handle = LoadLibrary(TEXT("libinit_lib.dll"));
 #else
   void *handle;
-  handle = dlopen("init_lib.so", RTLD_LAZY);
+  handle = dlopen("libinit_lib.so", RTLD_LAZY);
 #endif

 if (!handle) {

From 79c0e3a7104d086f1b8e9ea9e662c10181f19788 Mon Sep 17 00:00:00 2001
From: Ziyi Mu
Date: Wed, 2 Oct 2019 02:20:11 +0000
Subject: [PATCH 085/111] add unittest for custom op
---
 ...ary_loading.py => test_mxnet_extension.py} | 41 ++++++++++++++++++-
 1 file changed, 39 insertions(+), 2 deletions(-)
 rename tests/python/unittest/{test_library_loading.py => test_mxnet_extension.py} (53%)

diff --git a/tests/python/unittest/test_library_loading.py b/tests/python/unittest/test_mxnet_extension.py
similarity index 53%
rename from tests/python/unittest/test_library_loading.py
rename to tests/python/unittest/test_mxnet_extension.py
index 29b99dacdbe1..0224e967fcdd 100644
--- a/tests/python/unittest/test_library_loading.py
+++ b/tests/python/unittest/test_mxnet_extension.py
@@ -21,15 +21,16 @@
 import platform
 import unittest
 import mxnet as mx
+import numpy as np
 from mxnet.base import MXNetError
-from mxnet.test_utils import download, is_cd_run
+from mxnet.test_utils import download, is_cd_run, assert_almost_equal

 def check_platform():
     return platform.machine() not in ['x86_64', 'AMD64']

 @unittest.skipIf(check_platform(), "not all machine types supported")
 @unittest.skipIf(is_cd_run(), "continuous delivery run - ignoring test")
-def test_library_loading():
+def test_custom_op():
     if (os.name=='posix'):
         lib = 'libsample_lib.so'
         if os.path.exists(lib):
@@ -47,3 +48,39 @@ def test_library_loading():
         fname = os.path.abspath(fname)

     mx.library.load(fname)
+
+    # test simple 2D gemm custom op loaded from sample library
+    s = mx.sym.Variable('s')
+    t = mx.sym.Variable('t')
+    c = mx.sym.my_gemm(s,t)
+    d = mx.sym.state_gemm(s,t)
+    base = mx.sym.linalg.gemm2(s,t) # baseline
+
+    dim_n, dim_k, dim_m = tuple(np.random.randint(1, 5, size=3))
+
+    mat1 = mx.nd.random.uniform(-10, 10, shape=(dim_n, dim_k))
+    mat2 = mx.nd.random.uniform(-10, 10, shape=(dim_k, dim_m))
+
+    in_grad1 = [mx.nd.empty((dim_n,dim_k)),mx.nd.empty((dim_k,dim_m))]
+    in_grad2 = [mx.nd.empty((dim_n,dim_k)),mx.nd.empty((dim_k,dim_m))]
+    in_grad_base = [mx.nd.empty((dim_n,dim_k)),mx.nd.empty((dim_k,dim_m))]
+
+    exe1 = c.bind(ctx=mx.cpu(),args={'s':mat1,'t':mat2},args_grad=in_grad1)
+    exe2 = d.bind(ctx=mx.cpu(),args={'s':mat1,'t':mat2},args_grad=in_grad2)
+    exe_base = base.bind(ctx=mx.cpu(),args={'s':mat1,'t':mat2},args_grad=in_grad_base)
+
+    out1 = exe1.forward()
+    out2 = exe2.forward()
+    out2 = exe2.forward() # stateful
+    out_base = exe_base.forward()
+
+    assert_almost_equal(out_base[0].asnumpy(), out1[0].asnumpy())
+    assert_almost_equal(out_base[0].asnumpy(), out2[0].asnumpy())
+
+    out_grad = mx.nd.ones((dim_n, dim_m))
+    exe1.backward([out_grad])
+    exe2.backward([out_grad])
+    exe_base.backward([out_grad])
+
+    assert_almost_equal(in_grad_base[0].asnumpy(), in_grad1[0].asnumpy())
+    assert_almost_equal(in_grad_base[0].asnumpy(), in_grad2[0].asnumpy())

From 11d334472cfdce18ce59d016752cee820f3f1669 Mon Sep 17 00:00:00 2001
From: Ziyi Mu
Date: Wed, 2 Oct 2019 02:29:48 +0000
Subject: [PATCH 086/111] update Makefile to resolve merge
---
 Makefile | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/Makefile b/Makefile
index 22cd94c63d83..12feee183d54 100644
--- a/Makefile
+++ b/Makefile
@@ -672,17 +672,6 @@ pylint:

 sample_lib:
 	$(CXX) -shared -fPIC
-std=gnu++0x example/mxnet_extension/lib_custom_op/gemm_lib.cc -o libsample_lib.so -I include/mxnet -doc: docs - -docs: - make -C docs html - -clean_docs: - make -C docs clean - -doxygen: - doxygen docs/Doxyfile - # Cython build cython: cd python; $(PYTHON) setup.py build_ext --inplace --with-cython From 9504b335e83ebe62f1ac72651ef8e7a1c67b713e Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Wed, 2 Oct 2019 07:20:34 +0000 Subject: [PATCH 087/111] fix test and rename folder --- example/{mxnet_extension => extensions}/lib_api/Makefile | 0 .../{mxnet_extension => extensions}/lib_api/init_lib.cc | 0 .../{mxnet_extension => extensions}/lib_api/libtest.cc | 0 .../lib_api/test_loading.py | 0 .../lib_custom_op/Makefile | 0 .../lib_custom_op/gemm_lib.cc | 0 .../lib_custom_op/subgraph_lib.cc | 0 .../lib_custom_op/test_gemm.py | 0 .../lib_custom_op/test_subgraph.py | 0 tests/python/gpu/test_operator_gpu.py | 2 +- .../{test_mxnet_extension.py => test_extensions.py} | 8 ++++---- 11 files changed, 5 insertions(+), 5 deletions(-) rename example/{mxnet_extension => extensions}/lib_api/Makefile (100%) rename example/{mxnet_extension => extensions}/lib_api/init_lib.cc (100%) rename example/{mxnet_extension => extensions}/lib_api/libtest.cc (100%) rename example/{mxnet_extension => extensions}/lib_api/test_loading.py (100%) rename example/{mxnet_extension => extensions}/lib_custom_op/Makefile (100%) rename example/{mxnet_extension => extensions}/lib_custom_op/gemm_lib.cc (100%) rename example/{mxnet_extension => extensions}/lib_custom_op/subgraph_lib.cc (100%) rename example/{mxnet_extension => extensions}/lib_custom_op/test_gemm.py (100%) rename example/{mxnet_extension => extensions}/lib_custom_op/test_subgraph.py (100%) rename tests/python/unittest/{test_mxnet_extension.py => test_extensions.py} (96%) diff --git a/example/mxnet_extension/lib_api/Makefile b/example/extensions/lib_api/Makefile similarity index 100% rename from example/mxnet_extension/lib_api/Makefile rename to example/extensions/lib_api/Makefile diff --git a/example/mxnet_extension/lib_api/init_lib.cc b/example/extensions/lib_api/init_lib.cc similarity index 100% rename from example/mxnet_extension/lib_api/init_lib.cc rename to example/extensions/lib_api/init_lib.cc diff --git a/example/mxnet_extension/lib_api/libtest.cc b/example/extensions/lib_api/libtest.cc similarity index 100% rename from example/mxnet_extension/lib_api/libtest.cc rename to example/extensions/lib_api/libtest.cc diff --git a/example/mxnet_extension/lib_api/test_loading.py b/example/extensions/lib_api/test_loading.py similarity index 100% rename from example/mxnet_extension/lib_api/test_loading.py rename to example/extensions/lib_api/test_loading.py diff --git a/example/mxnet_extension/lib_custom_op/Makefile b/example/extensions/lib_custom_op/Makefile similarity index 100% rename from example/mxnet_extension/lib_custom_op/Makefile rename to example/extensions/lib_custom_op/Makefile diff --git a/example/mxnet_extension/lib_custom_op/gemm_lib.cc b/example/extensions/lib_custom_op/gemm_lib.cc similarity index 100% rename from example/mxnet_extension/lib_custom_op/gemm_lib.cc rename to example/extensions/lib_custom_op/gemm_lib.cc diff --git a/example/mxnet_extension/lib_custom_op/subgraph_lib.cc b/example/extensions/lib_custom_op/subgraph_lib.cc similarity index 100% rename from example/mxnet_extension/lib_custom_op/subgraph_lib.cc rename to example/extensions/lib_custom_op/subgraph_lib.cc diff --git a/example/mxnet_extension/lib_custom_op/test_gemm.py 
b/example/extensions/lib_custom_op/test_gemm.py similarity index 100% rename from example/mxnet_extension/lib_custom_op/test_gemm.py rename to example/extensions/lib_custom_op/test_gemm.py diff --git a/example/mxnet_extension/lib_custom_op/test_subgraph.py b/example/extensions/lib_custom_op/test_subgraph.py similarity index 100% rename from example/mxnet_extension/lib_custom_op/test_subgraph.py rename to example/extensions/lib_custom_op/test_subgraph.py diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 7aac23acd549..55a612df8773 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -48,7 +48,7 @@ from test_subgraph_op import * from test_contrib_operator import test_multibox_target_op from test_tvm_op import * -from test_library_loading import * +from test_extensions import * set_default_context(mx.gpu(0)) del test_support_vector_machine_l1_svm # noqa diff --git a/tests/python/unittest/test_mxnet_extension.py b/tests/python/unittest/test_extensions.py similarity index 96% rename from tests/python/unittest/test_mxnet_extension.py rename to tests/python/unittest/test_extensions.py index 0224e967fcdd..6870a8735c14 100644 --- a/tests/python/unittest/test_mxnet_extension.py +++ b/tests/python/unittest/test_extensions.py @@ -74,13 +74,13 @@ def test_custom_op(): out2 = exe2.forward() # stateful out_base = exe_base.forward() - assert_almost_equal(out_base[0].asnumpy(), out1[0].asnumpy()) - assert_almost_equal(out_base[0].asnumpy(), out2[0].asnumpy()) + assert_almost_equal(out_base[0].asnumpy(), out1[0].asnumpy(), rtol=1e-3, atol=1e-3) + assert_almost_equal(out_base[0].asnumpy(), out2[0].asnumpy(), rtol=1e-3, atol=1e-3) out_grad = mx.nd.ones((dim_n, dim_m)) exe1.backward([out_grad]) exe2.backward([out_grad]) exe_base.backward([out_grad]) - assert_almost_equal(in_grad_base[0].asnumpy(), in_grad1[0].asnumpy()) - assert_almost_equal(in_grad_base[0].asnumpy(), in_grad2[0].asnumpy()) + assert_almost_equal(in_grad_base[0].asnumpy(), in_grad1[0].asnumpy(), rtol=1e-3, atol=1e-3) + assert_almost_equal(in_grad_base[0].asnumpy(), in_grad2[0].asnumpy(), rtol=1e-3, atol=1e-3) From cf27d570bad6e047c3e7e54b6a6b207a25a0339b Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Wed, 2 Oct 2019 07:41:27 +0000 Subject: [PATCH 088/111] fix makefile rename --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 12feee183d54..be24eea44b60 100644 --- a/Makefile +++ b/Makefile @@ -670,7 +670,7 @@ pylint: # sample lib for MXNet extension dynamically loading custom operator sample_lib: - $(CXX) -shared -fPIC -std=gnu++0x example/mxnet_extension/lib_custom_op/gemm_lib.cc -o libsample_lib.so -I include/mxnet + $(CXX) -shared -fPIC -std=gnu++0x example/extensions/lib_custom_op/gemm_lib.cc -o libsample_lib.so -I include/mxnet # Cython build cython: From 7f456d4fedb520edfeac020d4f04b486213e36e8 Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Wed, 2 Oct 2019 07:45:49 +0000 Subject: [PATCH 089/111] fix cmake rename --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f1c39b966f14..b103b60827a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -693,7 +693,7 @@ else() endif() -add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/mxnet_extension/lib_custom_op/gemm_lib.cc) +add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/gemm_lib.cc) target_include_directories(sample_lib PUBLIC 
${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) set(MXNET_INSTALL_TARGETS mxnet) if(UNIX) From e50819b60d1d087414fd537ba671d8e0334a02c8 Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Wed, 2 Oct 2019 18:20:15 +0000 Subject: [PATCH 090/111] add explicit cpu context --- tests/python/unittest/test_extensions.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/python/unittest/test_extensions.py b/tests/python/unittest/test_extensions.py index 6870a8735c14..cc7858dce0fd 100644 --- a/tests/python/unittest/test_extensions.py +++ b/tests/python/unittest/test_extensions.py @@ -58,12 +58,12 @@ def test_custom_op(): dim_n, dim_k, dim_m = tuple(np.random.randint(1, 5, size=3)) - mat1 = mx.nd.random.uniform(-10, 10, shape=(dim_n, dim_k)) - mat2 = mx.nd.random.uniform(-10, 10, shape=(dim_k, dim_m)) + mat1 = mx.nd.random.uniform(-10, 10, shape=(dim_n, dim_k), ctx=mx.cpu()) + mat2 = mx.nd.random.uniform(-10, 10, shape=(dim_k, dim_m), ctx=mx.cpu()) - in_grad1 = [mx.nd.empty((dim_n,dim_k)),mx.nd.empty((dim_k,dim_m))] - in_grad2 = [mx.nd.empty((dim_n,dim_k)),mx.nd.empty((dim_k,dim_m))] - in_grad_base = [mx.nd.empty((dim_n,dim_k)),mx.nd.empty((dim_k,dim_m))] + in_grad1 = [mx.nd.empty((dim_n,dim_k),ctx=mx.cpu()),mx.nd.empty((dim_k,dim_m),ctx=mx.cpu())] + in_grad2 = [mx.nd.empty((dim_n,dim_k),ctx=mx.cpu()),mx.nd.empty((dim_k,dim_m),ctx=mx.cpu())] + in_grad_base = [mx.nd.empty((dim_n,dim_k),ctx=mx.cpu()),mx.nd.empty((dim_k,dim_m),ctx=mx.cpu())] exe1 = c.bind(ctx=mx.cpu(),args={'s':mat1,'t':mat2},args_grad=in_grad1) exe2 = d.bind(ctx=mx.cpu(),args={'s':mat1,'t':mat2},args_grad=in_grad2) @@ -77,7 +77,7 @@ def test_custom_op(): assert_almost_equal(out_base[0].asnumpy(), out1[0].asnumpy(), rtol=1e-3, atol=1e-3) assert_almost_equal(out_base[0].asnumpy(), out2[0].asnumpy(), rtol=1e-3, atol=1e-3) - out_grad = mx.nd.ones((dim_n, dim_m)) + out_grad = mx.nd.ones((dim_n, dim_m), ctx=mx.cpu()) exe1.backward([out_grad]) exe2.backward([out_grad]) exe_base.backward([out_grad]) From bd2c3a0b5dc5ce7d9c91c6e761a3163ffcb5f933 Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Thu, 3 Oct 2019 21:01:41 +0000 Subject: [PATCH 091/111] wkcn feedback: change mxtensor func name. 
use c++11 flag --- Makefile | 2 +- example/extensions/lib_custom_op/Makefile | 6 +-- example/extensions/lib_custom_op/gemm_lib.cc | 45 +++++++------------ .../extensions/lib_custom_op/subgraph_lib.cc | 6 +-- include/mxnet/lib_api.h | 18 ++++---- src/c_api/c_api.cc | 4 +- 6 files changed, 34 insertions(+), 47 deletions(-) diff --git a/Makefile b/Makefile index be24eea44b60..c8153fc53d0a 100644 --- a/Makefile +++ b/Makefile @@ -670,7 +670,7 @@ pylint: # sample lib for MXNet extension dynamically loading custom operator sample_lib: - $(CXX) -shared -fPIC -std=gnu++0x example/extensions/lib_custom_op/gemm_lib.cc -o libsample_lib.so -I include/mxnet + $(CXX) -shared -fPIC -std=c++11 example/extensions/lib_custom_op/gemm_lib.cc -o libsample_lib.so -I include/mxnet # Cython build cython: diff --git a/example/extensions/lib_custom_op/Makefile b/example/extensions/lib_custom_op/Makefile index 34aed449d7a1..66079a16a338 100644 --- a/example/extensions/lib_custom_op/Makefile +++ b/example/extensions/lib_custom_op/Makefile @@ -18,10 +18,10 @@ all: subgraph_lib gemm_lib gemm_lib: - g++ -shared -fPIC -std=gnu++0x gemm_lib.cc -o libgemm_lib.so -I ../../../include/mxnet + g++ -shared -fPIC -std=c++11 gemm_lib.cc -o libgemm_lib.so -I ../../../include/mxnet subgraph_lib: - g++ -shared -fPIC -std=gnu++0x subgraph_lib.cc -o libsubgraph_lib.so -I ../../../include/mxnet + g++ -shared -fPIC -std=c++11 subgraph_lib.cc -o libsubgraph_lib.so -I ../../../include/mxnet clean: - rm -rf *.so + rm -rf libsubgraph_lib.so libgemm_lib.so diff --git a/example/extensions/lib_custom_op/gemm_lib.cc b/example/extensions/lib_custom_op/gemm_lib.cc index 0f65dc517162..dca37cc1d643 100644 --- a/example/extensions/lib_custom_op/gemm_lib.cc +++ b/example/extensions/lib_custom_op/gemm_lib.cc @@ -27,7 +27,8 @@ #include "lib_api.h" // main matrix multiplication routine -void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { +void gemm(const float* A, const float* B, float* C, + const unsigned n, const unsigned k, const unsigned m) { unsigned i, j, kk; for (i = 0; i < n; i++) { for (j = 0; j < m; j++) { @@ -39,7 +40,7 @@ void gemm(float* A, float* B, float* C, unsigned n, unsigned k, unsigned m) { } } -void transpose(float* A, float* At, unsigned n, unsigned m) { +void transpose(const float* A, float* At, const unsigned n, const unsigned m) { unsigned i, j; for (i = 0; i < n; i++) { for (j = 0; j < m; j++) { @@ -56,18 +57,10 @@ MXReturnValue forward(std::map attrs, std::vector inputs, std::vector outputs, OpResource res) { - // validate inputs - for (unsigned i = 0; i < inputs.size(); i++) { - if (inputs[i].dtype != kFloat32) { - std::cout << "Expected input " << i << " to have float32 type" << std::endl; - return MX_FAIL; - } - } - // extract data pointers from tensors - float* A = inputs[0].getData(); - float* B = inputs[1].getData(); - float* C = outputs[0].getData(); + float* A = inputs[0].data(); + float* B = inputs[1].data(); + float* C = outputs[0].data(); // set tensor shapes unsigned n = inputs[0].shape[0]; unsigned k = inputs[0].shape[1]; @@ -92,20 +85,12 @@ MXReturnValue backward(std::map attrs, std::vector inputs, std::vector outputs, OpResource res) { - // validate inputs - for (unsigned i = 0; i < inputs.size(); i++) { - if (inputs[i].dtype != kFloat32) { - std::cout << "Expected input " << i << " to have float32 type" << std::endl; - return MX_FAIL; - } - } - // extract data pointers from tensors - float* dC = inputs[0].getData(); - float* A = inputs[1].getData(); - float* B = inputs[2].getData(); - 
float* dA = outputs[0].getData(); - float* dB = outputs[1].getData(); + float* dC = inputs[0].data(); + float* A = inputs[1].data(); + float* B = inputs[2].data(); + float* dA = outputs[0].data(); + float* dB = outputs[1].data(); // set tensor shapes unsigned n = inputs[1].shape[0]; unsigned k = inputs[1].shape[1]; @@ -138,9 +123,11 @@ MXReturnValue inferType(std::map attrs, std::cout << "Expected 2 inputs to inferType" << std::endl; return MX_FAIL; } - if (intypes[0] != intypes[1]) { - std::cout << "Expected 2 inputs to have same data type for inferType" << std::endl; - return MX_FAIL; + for (unsigned i = 0; i < intypes.size(); i++) { + if (intypes[i] != kFloat32) { + std::cout << "Expected input " << i << " to have float32 type" << std::endl; + return MX_FAIL; + } } outtypes[0] = intypes[0]; diff --git a/example/extensions/lib_custom_op/subgraph_lib.cc b/example/extensions/lib_custom_op/subgraph_lib.cc index 6a9331043565..3537f0c890ee 100644 --- a/example/extensions/lib_custom_op/subgraph_lib.cc +++ b/example/extensions/lib_custom_op/subgraph_lib.cc @@ -68,9 +68,9 @@ class MyStatefulOp : public CustomStatefulOp { std::vector outputs, OpResource op_res) { std::cout << "Info: subgraph symbol is: " << subgraph_sym << std::endl; - float* in_data = inputs[0].getData(); - float* out_data = outputs[0].getData(); - for (int i = 0; i < inputs[0].getDataSize(); i++) { + float* in_data = inputs[0].data(); + float* out_data = outputs[0].data(); + for (int i = 0; i < inputs[0].size(); i++) { out_data[i] = in_data[i]; } return MX_SUCCESS; diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index c57c7e9f109e..56d94911f699 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -60,19 +60,19 @@ enum MXReturnValue { * \brief External Tensor data structure */ struct MXTensor { - MXTensor() : data(NULL) {} + MXTensor() : data_ptr(NULL) {} - MXTensor(void *data, const std::vector &shape, MXDType dtype) - : data(data), shape(shape), dtype(dtype) {} + MXTensor(void *data_ptr, const std::vector &shape, MXDType dtype) + : data_ptr(data_ptr), shape(shape), dtype(dtype) {} /*! \brief helper function to cast data pointer */ template - inline data_type* getData() { - return reinterpret_cast(data); + inline data_type* data() { + return reinterpret_cast(data_ptr); } /*! 
\brief helper function to get data size */
-  inline int64_t getDataSize() {
+  inline int64_t size() {
     int64_t size = 1;
     for (unsigned int i = 0; i < shape.size(); i++) {
       size *= shape[i];
@@ -83,7 +83,7 @@ struct MXTensor {
   // data is flatten 1D repr of tensor, elements are in continuous memory
   // user can access each element using the shape of tensor
   // it may also point to data allocated on gpu
-  void *data;
+  void *data_ptr;

   // shape is in [2,3,4] format to represent high-dim tensor
@@ -682,7 +682,7 @@ extern "C" {
     // create a vector of tensors for inputs
     std::vector<MXTensor> inputs(num_in);
     for (int i = 0; i < num_in; i++) {
-      inputs[i].data = indata[i];
+      inputs[i].data_ptr = indata[i];
       inputs[i].dtype = (MXDType)intypes[i];
       for (int j = 0; j < indims[i]; j++) {
         inputs[i].shape.push_back(inshapes[i][j]);
@@ -692,7 +692,7 @@ extern "C" {
     // create a vector of tensors for outputs
     std::vector<MXTensor> outputs(num_out);
     for (int i = 0; i < num_out; i++) {
-      outputs[i].data = outdata[i];
+      outputs[i].data_ptr = outdata[i];
       outputs[i].dtype = (MXDType) outtypes[i];
       for (int j = 0; j < outdims[i]; j++) {
         outputs[i].shape.push_back(outshapes[i][j]);
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 289867b00616..129fd599d297 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -556,7 +556,7 @@ int MXLoadLib(const char *path) {
       // create a vector of tensors for inputs
       std::vector<MXTensor> c_inputs(inputs.size());
       for (size_t i = 0; i < inputs.size(); i++) {
-        c_inputs[i].data = inputs[i].data().dptr_;
+        c_inputs[i].data_ptr = inputs[i].data().dptr_;
         c_inputs[i].dtype = (MXDType)inputs[i].dtype();
         for (int_least16_t j = 0; j < inputs[i].shape().ndim(); j++) {
           c_inputs[i].shape.push_back(inputs[i].shape().data()[j]);
@@ -566,7 +566,7 @@ int MXLoadLib(const char *path) {
       // create a vector of tensors for outputs
       std::vector<MXTensor> c_outputs(outputs.size());
       for (size_t i = 0; i < outputs.size(); i++) {
-        c_outputs[i].data = outputs[i].data().dptr_;
+        c_outputs[i].data_ptr = outputs[i].data().dptr_;
         c_outputs[i].dtype = (MXDType)outputs[i].dtype();
         for (int j = 0; j < outputs[i].shape().ndim(); j++) {
           c_outputs[i].shape.push_back(outputs[i].shape().data()[j]);

From b07e46bea1c07eaa34896dbed1861fc59278f8e4 Mon Sep 17 00:00:00 2001
From: Ziyi Mu
Date: Thu, 3 Oct 2019 21:37:04 +0000
Subject: [PATCH 092/111] add operator keyword test and refine info print
---
 example/extensions/lib_custom_op/gemm_lib.cc     | 9 ++++++---
 example/extensions/lib_custom_op/subgraph_lib.cc | 6 ++++--
 example/extensions/lib_custom_op/test_gemm.py    | 9 +++++++--
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/example/extensions/lib_custom_op/gemm_lib.cc b/example/extensions/lib_custom_op/gemm_lib.cc
index dca37cc1d643..f3f24219cf8a 100644
--- a/example/extensions/lib_custom_op/gemm_lib.cc
+++ b/example/extensions/lib_custom_op/gemm_lib.cc
@@ -183,7 +183,7 @@ class MyStatefulGemm : public CustomStatefulOp {
                         OpResource op_res) {
     int* p = static_cast<int*>(op_res.alloc(sizeof(int)));
     *p = ++count;
-    std::cout << "Info: op resource testing: " << *p << std::endl;
+    std::cout << "Info: cpu malloc test: keyword + number of forward: " << *p << std::endl;
     std::map<std::string, std::string> attrs;
     return forward(attrs, inputs, outputs, op_res);
@@ -204,8 +204,11 @@ class MyStatefulGemm : public CustomStatefulOp {

 MXReturnValue createOpState(std::map<std::string, std::string> attrs,
                             CustomStatefulOp** op_inst) {
-  *op_inst = new MyStatefulGemm(58);
-  std::cout << "Info: create op state successful" << std::endl;
+  int count = 0;
+  if (attrs.count("test_kw") > 0)
+    count =
std::stoi(attrs["test_kw"]); + *op_inst = new MyStatefulGemm(count); + std::cout << "Info: stateful operator created" << std::endl; return MX_SUCCESS; } diff --git a/example/extensions/lib_custom_op/subgraph_lib.cc b/example/extensions/lib_custom_op/subgraph_lib.cc index 3537f0c890ee..a6e4cfc727e4 100644 --- a/example/extensions/lib_custom_op/subgraph_lib.cc +++ b/example/extensions/lib_custom_op/subgraph_lib.cc @@ -67,9 +67,11 @@ class MyStatefulOp : public CustomStatefulOp { MXReturnValue Forward(std::vector inputs, std::vector outputs, OpResource op_res) { - std::cout << "Info: subgraph symbol is: " << subgraph_sym << std::endl; + std::cout << "Info: subgraph symbol is: " << std::endl; + std::cout << subgraph_sym << std::endl; float* in_data = inputs[0].data(); float* out_data = outputs[0].data(); + std::cout << "Info: output is: " << std::endl; for (int i = 0; i < inputs[0].size(); i++) { out_data[i] = in_data[i]; } @@ -90,7 +92,7 @@ MXReturnValue createOpState(std::map attrs, serialized_subgraph = attrs[SUBGRAPH_SYM_JSON]; } *op_inst = new MyStatefulOp(serialized_subgraph); - std::cout << "Info: create op state successful" << std::endl; + std::cout << "Info: stateful operator created" << std::endl; return MX_SUCCESS; } diff --git a/example/extensions/lib_custom_op/test_gemm.py b/example/extensions/lib_custom_op/test_gemm.py index 17df4cc1d876..9a588255032f 100644 --- a/example/extensions/lib_custom_op/test_gemm.py +++ b/example/extensions/lib_custom_op/test_gemm.py @@ -39,13 +39,14 @@ print("--------start ndarray compute---------") print(mx.nd.my_gemm(a,b)) -print(mx.nd.state_gemm(a,b)) +print("--------") +print(mx.nd.state_gemm(a,b,test_kw=100)) print("--------start symbolic compute--------") s = mx.sym.Variable('s') t = mx.sym.Variable('t') c = mx.sym.my_gemm(s,t) -d = mx.sym.state_gemm(s,t) +d = mx.sym.state_gemm(s,t,test_kw=200) in_grad = [mx.nd.empty((2,3)),mx.nd.empty((3,1))] in_grad2 = [mx.nd.empty((2,3)),mx.nd.empty((3,1))] @@ -55,10 +56,12 @@ out = exe.forward() print(out) +print("-------") out2 = exe2.forward() out2 = exe2.forward() print(out2) +print("-------") # baseline forward e = mx.sym.linalg.gemm2(s,t) @@ -71,7 +74,9 @@ out_grad = mx.nd.ones((2,1)) exe.backward([out_grad]) print(in_grad) +print("-------") exe2.backward([out_grad]) print(in_grad2) +print("-------") exe3.backward([out_grad]) print(in_grad3) From 2466d67203873df1c834d7da7b4370fd48f60778 Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Thu, 3 Oct 2019 23:42:57 +0000 Subject: [PATCH 093/111] using typedef in forward --- example/extensions/lib_custom_op/gemm_lib.cc | 24 ++++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/example/extensions/lib_custom_op/gemm_lib.cc b/example/extensions/lib_custom_op/gemm_lib.cc index f3f24219cf8a..1452b5289119 100644 --- a/example/extensions/lib_custom_op/gemm_lib.cc +++ b/example/extensions/lib_custom_op/gemm_lib.cc @@ -57,16 +57,20 @@ MXReturnValue forward(std::map attrs, std::vector inputs, std::vector outputs, OpResource res) { - // extract data pointers from tensors - float* A = inputs[0].data(); - float* B = inputs[1].data(); - float* C = outputs[0].data(); - // set tensor shapes - unsigned n = inputs[0].shape[0]; - unsigned k = inputs[0].shape[1]; - unsigned m = inputs[1].shape[1]; - - gemm(A, B, C, n, k, m); + // simple example of using runtime data type + if (inputs[0].dtype == kFloat32) { + typedef float DType; + // extract data pointers from tensors + DType* A = inputs[0].data(); + DType* B = inputs[1].data(); + DType* C = 
outputs[0].data(); + // set tensor shapes + unsigned n = inputs[0].shape[0]; + unsigned k = inputs[0].shape[1]; + unsigned m = inputs[1].shape[1]; + + gemm(A, B, C, n, k, m); + } return MX_SUCCESS; } From e0414002f2361d6e8c88a1bec4ad025995abfd00 Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Fri, 4 Oct 2019 22:03:46 +0000 Subject: [PATCH 094/111] small refine of docblock --- include/mxnet/lib_api.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 56d94911f699..1c4d069ad16f 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -39,7 +39,7 @@ #define MX_LIBRARY_VERSION 1 /*! - * \brief External Tensor data types + * \brief Tensor data type, consistent with mshadow data type */ enum MXDType { kFloat32 = 0, @@ -57,7 +57,7 @@ enum MXReturnValue { }; /*! - * \brief External Tensor data structure + * \brief Tensor data structure used by custom operator */ struct MXTensor { MXTensor() : data_ptr(NULL) {} @@ -101,7 +101,7 @@ struct MXTensor { typedef void* (*xpu_malloc_t)(void*, int); /*! - * \brief Class to provide resource APIs to Forward/Backward functions + * \brief provide resource APIs memory allocation mechanism to Forward/Backward functions */ class OpResource { public: @@ -447,9 +447,7 @@ class Registry { #define REGISTER_OP(Name) _STR_CONCAT(_REGISTER_DEF_(Name), __COUNTER__) = \ Registry::get()->add(TOSTRING(Name)) -/* - * -------------- BELOW FUNCTIONS ARE USED IN MXNET BACKEND --------------- - */ +/* -------------- BELOW ARE CTYPE FUNCTIONS PROTOTYPES --------------- */ /*! * \brief Following are the C type APIs implemented in the external library @@ -509,7 +507,7 @@ typedef int (*opVersion_t)(); extern "C" { /*! \brief returns MXNet library version */ - #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) +#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) int __cdecl #else int From f16942cf856abaf849f94fc4bb640aa307ffbf5f Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Tue, 8 Oct 2019 21:49:13 +0000 Subject: [PATCH 095/111] change names --- .../extensions/lib_custom_op/subgraph_lib.cc | 10 +- include/mxnet/lib_api.h | 92 +++++++++---------- 2 files changed, 49 insertions(+), 53 deletions(-) diff --git a/example/extensions/lib_custom_op/subgraph_lib.cc b/example/extensions/lib_custom_op/subgraph_lib.cc index a6e4cfc727e4..8e7e8833745a 100644 --- a/example/extensions/lib_custom_op/subgraph_lib.cc +++ b/example/extensions/lib_custom_op/subgraph_lib.cc @@ -32,14 +32,14 @@ MXReturnValue parseAttrs(std::map attrs, *num_out = 1; if (attrs.count(SUBGRAPH_SYM_JSON)) { // example of subgraph json parsing - Json_Parser jp; - json_val val = jp.parse_to_json(attrs[SUBGRAPH_SYM_JSON]); + JsonParser jp; + JsonVal val = jp.parse_to_json(attrs[SUBGRAPH_SYM_JSON]); int input = 0; - for (auto &item : val.map[json_val("nodes")].list) { - if (item.map[json_val("op")].str == "null") + for (auto &item : val.map[JsonVal("nodes")].list) { + if (item.map[JsonVal("op")].str == "null") input++; } - int output = val.map[json_val("heads")].list.size(); + int output = val.map[JsonVal("heads")].list.size(); *num_in = input; *num_out = output; } diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 1c4d069ad16f..c1ae74179f3c 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -82,7 +82,6 @@ struct MXTensor { // data is flatten 1D repr of tensor, elements are in continuous memory // user can access each element using the shape of tensor - // it 
may also point to data allocated on gpu void *data_ptr; // shape is in [2,3,4] format to represent high-dim tensor @@ -90,9 +89,6 @@ struct MXTensor { // type can only be MXDType enum types MXDType dtype; - - // gpu flag to specify the data tensor storage location - bool is_gpu; }; /*! @@ -105,16 +101,16 @@ typedef void* (*xpu_malloc_t)(void*, int); */ class OpResource { public: - OpResource(xpu_malloc_t xm, void* _xm) : xpu_malloc(xm), _xpu_malloc(_xm) {} + OpResource(xpu_malloc_t cm, void* ca) : cpu_malloc(cm), cpu_alloc(ca) {} /*! \brief allocate memory controlled by MXNet */ void* alloc(int size) { - return xpu_malloc(_xpu_malloc, size); + return cpu_malloc(cpu_alloc, size); } private: - xpu_malloc_t xpu_malloc; - void* _xpu_malloc; + xpu_malloc_t cpu_malloc; + void* cpu_alloc; }; /*! @@ -124,20 +120,20 @@ class OpResource { #define SUBGRAPH_SYM_JSON "subgraph_sym_json" /*! \brief Types of JSON objects */ -enum json_type {ERR, STR, NUM, LIST, MAP}; +enum JsonType {ERR, STR, NUM, LIST, MAP}; /*! \brief definition of JSON objects */ -struct json_val { - json_val() : type(ERR), num(-1), str("") {} // default constructor +struct JsonVal { + JsonVal() : type(ERR), num(-1), str("") {} // default constructor // construct a JSON object by type - explicit json_val(json_type t) : type(t), num(-1), str("") {} + explicit JsonVal(JsonType t) : type(t), num(-1), str("") {} // construct a string JSON object - explicit json_val(std::string s) : type(STR), num(-1), str(s) {} + explicit JsonVal(std::string s) : type(STR), num(-1), str(s) {} // construct a number JSON object - explicit json_val(int n) : type(NUM), num(n), str(std::to_string(n)) {} + explicit JsonVal(int n) : type(NUM), num(n), str(std::to_string(n)) {} // complex constructor - json_val(json_type t, int n, std::string s) : type(t), num(n), str(s) {} - bool operator<(const json_val &o) const { + JsonVal(JsonType t, int n, std::string s) : type(t), num(n), str(s) {} + bool operator<(const JsonVal &o) const { // for string JSON objects compare the string if (type == STR) return type == o.type && str < o.str; // for number JSON objects compare the number @@ -162,24 +158,24 @@ struct json_val { } return type < o.type; } - json_type type; + JsonType type; int num; std::string str; - std::vector list; - std::map map; + std::vector list; + std::map map; }; /*! \brief functions used for parsing JSON */ -struct Json_Parser { - json_val parse_to_json(std::string json) { +struct JsonParser { + JsonVal parse_to_json(std::string json) { unsigned int idx = 0; return parse(json, &idx); } - void print_json_val(json_val val) { + void print_json_val(JsonVal val) { std::cout << json_val_string(val) << std::endl; } // debug function to convert a JSON object to a string - std::string json_val_string(const json_val &val) { + std::string json_val_string(const JsonVal &val) { std::string ret; switch (val.type) { case ERR: @@ -207,8 +203,8 @@ struct Json_Parser { return ret; } // parse a string JSON object - json_val parse_string(std::string json, unsigned int* idx) { - json_val ret(STR); + JsonVal parse_string(std::string json, unsigned int* idx) { + JsonVal ret(STR); while (*idx < json.size()) { if (json[*idx] == '"') { ++(*idx); @@ -219,11 +215,11 @@ struct Json_Parser { } } std::cout << "Error! 
Unable to parse string" << std::endl; - return json_val(); + return JsonVal(); } // parse a number JSON object - json_val parse_num(std::string json, unsigned int* idx) { - json_val ret(NUM); + JsonVal parse_num(std::string json, unsigned int* idx) { + JsonVal ret(NUM); while (*idx < json.size()) { if (json[*idx] >= '0' && json[*idx] <= '9') { ret.str += json[*idx]; @@ -236,30 +232,30 @@ struct Json_Parser { return ret; } // parse a list of JSON objects - json_val parse_list(std::string json, unsigned int* idx) { - json_val ret(LIST); + JsonVal parse_list(std::string json, unsigned int* idx) { + JsonVal ret(LIST); while (*idx < json.size()) { if (json[*idx] == ']') { ++(*idx); return ret; } else { - json_val item = parse(json, idx); + JsonVal item = parse(json, idx); if (item.type != ERR) ret.list.push_back(item); } } std::cout << "Error! Unable to parse list" << std::endl; - return json_val(); + return JsonVal(); } // parse a map of JSON objects - json_val parse_map(std::string json, unsigned int* idx) { - json_val ret(MAP), key; + JsonVal parse_map(std::string json, unsigned int* idx) { + JsonVal ret(MAP), key; while (*idx < json.size()) { if (json[*idx] == '}') { ++(*idx); return ret; } else { - json_val item = parse(json, idx); + JsonVal item = parse(json, idx); if (key.type == ERR) { key = item; } else { @@ -269,11 +265,11 @@ struct Json_Parser { } } std::cout << "Error! Unable to parse map" << std::endl; - return json_val(); + return JsonVal(); } // generic parse function - json_val parse(std::string json, unsigned int *idx) { - json_val ret; + JsonVal parse(std::string json, unsigned int *idx) { + JsonVal ret; while (*idx < json.size()) { if (json[*idx] == '"') { ++(*idx); @@ -432,20 +428,20 @@ class Registry { * Annoyingly, the concat_ and concat macros are necessary to * be able to use __COUNTER__ in an identifier name */ -#define _STR_CONCAT_(__a, __b) __a ## __b -#define _STR_CONCAT(__a, __b) _STR_CONCAT_(__a, __b) +#define MX_STR_CONCAT_(__a, __b) __a ## __b +#define MX_STR_CONCAT(__a, __b) MX_STR_CONCAT_(__a, __b) /*! \brief convert a token to a string */ -#define STRINGIFY(x) #x -#define TOSTRING(x) STRINGIFY(x) +#define MX_STRINGIFY(x) #x +#define MX_TOSTRING(x) MX_STRINGIFY(x) /*! \brief declare a variable with custom name */ -#define _REGISTER_NAME_(Name) MXNet ## _CustomOp ## _ -#define _REGISTER_DEF_(Name) CustomOp _REGISTER_NAME_(Name) +#define MX_REGISTER_NAME_(Name) MXNet ## _CustomOp ## _ +#define MX_REGISTER_DEF_(Name) CustomOp MX_REGISTER_NAME_(Name) /*! 
\brief assign a var to a value */ -#define REGISTER_OP(Name) _STR_CONCAT(_REGISTER_DEF_(Name), __COUNTER__) = \ - Registry::get()->add(TOSTRING(Name)) +#define REGISTER_OP(Name) MX_STR_CONCAT(MX_REGISTER_DEF_(Name), __COUNTER__) = \ + Registry::get()->add(MX_TOSTRING(Name)) /* -------------- BELOW ARE CTYPE FUNCTIONS PROTOTYPES --------------- */ @@ -670,7 +666,7 @@ extern "C" { void** indata, int* intypes, int num_in, const int64_t** outshapes, int* outdims, void** outdata, int* outtypes, int num_out, - xpu_malloc_t xpu_malloc, void* _xpu_malloc) { + xpu_malloc_t cpu_malloc, void* cpu_alloc) { // create map of attributes from list std::map attrs; for (int i = 0; i < num; i++) { @@ -697,7 +693,7 @@ extern "C" { } } - OpResource res(xpu_malloc, _xpu_malloc); + OpResource res(cpu_malloc, cpu_alloc); return fcomp(attrs, inputs, outputs, res); } From 50a6b6425707349726ede08944594c8ebd2b2151 Mon Sep 17 00:00:00 2001 From: Ziyi Mu Date: Tue, 8 Oct 2019 22:57:52 +0000 Subject: [PATCH 096/111] add separate stateful compute and pass state_op ptr --- include/mxnet/lib_api.h | 46 +++++++++++++++++++++++++++- src/c_api/c_api.cc | 66 ++++++++++++++++++++++------------------- 2 files changed, 80 insertions(+), 32 deletions(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index c1ae74179f3c..983f9ab94a0d 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -495,6 +495,11 @@ typedef int (*opCallMutateInputs_t)(mutateInputs_t, const char* const*, const ch typedef int (*opCallCreateOpState_t)(createOpState_t, const char* const*, const char* const*, int, void**); +#define MXLIB_OPCALLFSTATEFULCOMP_STR "_opCallFStatefulCompute" +typedef int (*opCallFStatefulComp_t)(bool, void*, const int64_t**, int*, void**, int*, int, + const int64_t**, int*, void**, int*, int, + xpu_malloc_t, void*); + #define MXLIB_INITIALIZE_STR "initialize" typedef int (*initialize_t)(int); @@ -654,7 +659,7 @@ extern "C" { return retval; } - /*! \brief returns status of calling Forward function for operator from library */ + /*! \brief returns status of calling Forward/Backward function for operator from library */ #if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__) __declspec(dllexport) int __cdecl #else @@ -750,6 +755,45 @@ extern "C" { return create_op(attrs, op_ptr); } + /*! 
+  /*! \brief returns status of calling Stateful Forward/Backward for operator from library */
+#if defined(_WIN32) || defined(_WIN64) || defined(__WINDOWS__)
+  __declspec(dllexport) int __cdecl
+#else
+  int
+#endif
+  _opCallFStatefulCompute(bool is_forward, void* state_op,
+                          const int64_t** inshapes, int* indims,
+                          void** indata, int* intypes, int num_in,
+                          const int64_t** outshapes, int* outdims,
+                          void** outdata, int* outtypes, int num_out,
+                          xpu_malloc_t cpu_malloc, void* cpu_alloc) {
+    // create a vector of tensors for inputs
+    std::vector<MXTensor> inputs(num_in);
+    for (int i = 0; i < num_in; i++) {
+      inputs[i].data_ptr = indata[i];
+      inputs[i].dtype = (MXDType)intypes[i];
+      for (int j = 0; j < indims[i]; j++) {
+        inputs[i].shape.push_back(inshapes[i][j]);
+      }
+    }
+
+    // create a vector of tensors for outputs
+    std::vector<MXTensor> outputs(num_out);
+    for (int i = 0; i < num_out; i++) {
+      outputs[i].data_ptr = outdata[i];
+      outputs[i].dtype = (MXDType) outtypes[i];
+      for (int j = 0; j < outdims[i]; j++) {
+        outputs[i].shape.push_back(outshapes[i][j]);
+      }
+    }
+
+    OpResource res(cpu_malloc, cpu_alloc);
+    CustomStatefulOp* op_ptr = reinterpret_cast<CustomStatefulOp*>(state_op);
+    if (is_forward) {
+      return op_ptr->Forward(inputs, outputs, res);
+    }
+    return op_ptr->Backward(inputs, outputs, res);
+  }
+
   /*!
    * \brief Checks if the MXNet version is supported by the library.
    * If supported, initializes the library.
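For context on the version check mentioned in the comment just above: a loaded library is expected to export an initialize function that MXNet calls right after dlopen/LoadLibrary. A minimal sketch (version threshold and messages are illustrative, patterned on the lib_custom_op example):

    // Library-side version gate: returning MX_FAIL aborts the load
    // before any registered operator is used.
    MXReturnValue initialize(int version) {
      if (version >= 10500) {
        std::cout << "MXNet version " << version << " supported" << std::endl;
        return MX_SUCCESS;
      }
      std::cout << "MXNet version " << version << " not supported" << std::endl;
      return MX_FAIL;
    }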
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 129fd599d297..8cf40fee1898 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -138,13 +138,16 @@ int MXLoadLib(const char *path) {
   opCallCreateOpState_t callCreateOpState =
     get_func<opCallCreateOpState_t>(lib, const_cast<char*>(MXLIB_OPCALLCREATEOPSTATE_STR));

+  opCallFStatefulComp_t callFStatefulComp =
+    get_func<opCallFStatefulComp_t>(lib, const_cast<char*>(MXLIB_OPCALLFSTATEFULCOMP_STR));
+
   // get number of operators registered in the library
   opRegSize_t opRegSize = get_func<opRegSize_t>(lib, const_cast<char*>(MXLIB_OPREGSIZE_STR));
   int numOps = opRegSize();
   LOG(INFO) << "Found " << numOps << " operators in library";

   /*
-   * The library has custom operators implementation
+   * Get all custom operators implementation from custom library
    * loop and register each operator in the library to NNVM
    */
   opRegGet_t opRegGet = get_func<opRegGet_t>(lib, const_cast<char*>(MXLIB_OPREGGET_STR));
@@ -368,11 +371,11 @@ int MXLoadLib(const char *path) {
   // lambda function to convert from external fcompute to internal MXNet types
   auto fcomp_lambda = [=](fcomp_t fcomp_fp,
-                         const nnvm::NodeAttrs& attrs,
-                         const OpContext& ctx,
-                         const std::vector<NDArray>& inputs,
-                         const std::vector<OpReqType>& req,
-                         const std::vector<NDArray>& outputs) {
+                          const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<NDArray>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray>& outputs) {
     // convert attributes to vector of char*
     std::vector<const char*> attr_keys, attr_vals;
     for (auto kv : attrs.dict) {
@@ -406,6 +409,7 @@ int MXLoadLib(const char *path) {
     mshadow::Stream<mxnet::cpu> *cpu_stream = ctx.get_stream<mxnet::cpu>();

     // create lambda that captures stream & resource objects
+    // the memory pointer returned will eventually return to user
     auto cpu_alloc = [&](int size) {
       mshadow::Tensor<mxnet::cpu, 1, char> data =
         resource.get_space_typed<mxnet::cpu, 1, char>(mshadow::Shape1(size), cpu_stream);
@@ -418,6 +422,7 @@ int MXLoadLib(const char *path) {
     auto cpu_malloc = [](void* _cpu_alloc, int size) {
       // cast the void* argument to the type for the cpu_alloc lambda function
       alloc_type* cpualloc = static_cast<alloc_type*>(_cpu_alloc);
+      // call cpu_alloc to actually allocate memory and get the pointer
       void* ptr = (*cpualloc)(size);
       return ptr;
     };
@@ -547,30 +552,31 @@ int MXLoadLib(const char *path) {
   };

   // stateful forward and backward
-  auto fstateful_lambda = [=](bool forward,
+  auto fstateful_lambda = [=](bool is_forward,
                               const OpStatePtr& state_ptr,
                               const OpContext& ctx,
                               const std::vector<NDArray>& inputs,
                               const std::vector<OpReqType>& req,
                               const std::vector<NDArray>& outputs) {
-    // create a vector of tensors for inputs
-    std::vector<MXTensor> c_inputs(inputs.size());
+    std::vector<void*> in_data, out_data;
+    std::vector<const int64_t*> in_shapes, out_shapes;
+    std::vector<int> in_dims, out_dims;
+    std::vector<int> in_types, out_types;
+
+    // convert input tensors to constituent parts
     for (size_t i = 0; i < inputs.size(); i++) {
-      c_inputs[i].data_ptr = inputs[i].data().dptr_;
-      c_inputs[i].dtype = (MXDType)inputs[i].dtype();
-      for (int_least16_t j = 0; j < inputs[i].shape().ndim(); j++) {
-        c_inputs[i].shape.push_back(inputs[i].shape().data()[j]);
-      }
+      in_data.push_back(inputs[i].data().dptr_);
+      in_shapes.push_back(inputs[i].shape().data());
+      in_dims.push_back(inputs[i].shape().ndim());
+      in_types.push_back(inputs[i].dtype());
     }

-    // create a vector of tensors for outputs
-    std::vector<MXTensor> c_outputs(outputs.size());
+    // convert output tensors to constituent parts
     for (size_t i = 0; i < outputs.size(); i++) {
-      c_outputs[i].data_ptr = outputs[i].data().dptr_;
-      c_outputs[i].dtype = (MXDType)outputs[i].dtype();
-      for (int j = 0; j < outputs[i].shape().ndim(); j++) {
-        c_outputs[i].shape.push_back(outputs[i].shape().data()[j]);
-      }
+      out_data.push_back(outputs[i].data().dptr_);
+      out_shapes.push_back(outputs[i].shape().data());
+      out_dims.push_back(outputs[i].shape().ndim());
+      out_types.push_back(outputs[i].dtype());
     }

     // get memory resource
@@ -578,6 +584,7 @@ int MXLoadLib(const char *path) {
     mshadow::Stream<mxnet::cpu> *cpu_stream = ctx.get_stream<mxnet::cpu>();

     // create lambda that captures stream & resource objects
+    // the memory pointer returned will eventually return to user
     auto cpu_alloc = [&](int size) {
       mshadow::Tensor<mxnet::cpu, 1, char> data =
         resource.get_space_typed<mxnet::cpu, 1, char>(mshadow::Shape1(size), cpu_stream);
@@ -590,26 +597,23 @@ int MXLoadLib(const char *path) {
     auto cpu_malloc = [](void* _cpu_alloc, int size) {
       // cast the void* argument to the type for the cpu_alloc lambda function
       alloc_type* cpualloc = static_cast<alloc_type*>(_cpu_alloc);
+      // call cpu_alloc to actually allocate memory and get the pointer
       void* ptr = (*cpualloc)(size);
       return ptr;
     };

-    OpResource op_res(cpu_malloc, &cpu_alloc);
-
     // retrieve op state object created from CreateOpState
     CustomStatefulOpWrapper& op = state_ptr.get_state<CustomStatefulOpWrapper>();
     CustomStatefulOp* state_op_inst = op.get_instance();
     CHECK(state_op_inst != nullptr)
       << "Error MXNet cannot load custom stateful operator '" << name_str << "'";

-    if (forward) {
-      CHECK(state_op_inst->Forward(c_inputs, c_outputs, op_res))
-        << "Error calling ForwardStateful for custom operator '" << name_str << "'";
-    } else {
-      CHECK(state_op_inst->Backward(c_inputs, c_outputs, op_res))
-        << "Error calling BackwardStateful for custom operator '" << name_str << "'";
-    }
-    // return type void
+    // call fcompute function
+    CHECK(callFStatefulComp(is_forward, state_op_inst, in_shapes.data(), in_dims.data(),
+                            in_data.data(), in_types.data(), in_data.size(),
+                            out_shapes.data(), out_dims.data(), out_data.data(),
+                            out_types.data(), out_data.size(), cpu_malloc, &cpu_alloc))
+      << "Error calling FStatefulCompute for custom operator '" << name_str << "'";
   };

   auto fstateful_forward = [=](const OpStatePtr& state_ptr,
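The new _opCallFStatefulCompute entry point ultimately dispatches into a library-defined subclass of CustomStatefulOp. A minimal sketch of such a subclass (the counter logic is invented for illustration; compare MyStatefulGemm in the next patch):

    // State (here, a call counter) lives in the object. MXNet hands the
    // same object back through the opaque state_op pointer on every call,
    // and _opCallFStatefulCompute casts it and invokes Forward or Backward.
    class MyCountingOp : public CustomStatefulOp {
     public:
      MXReturnValue Forward(std::vector<MXTensor> inputs,
                            std::vector<MXTensor> outputs,
                            OpResource res) {
        ++count;  // persists across forward calls
        return MX_SUCCESS;
      }
      MXReturnValue Backward(std::vector<MXTensor> inputs,
                             std::vector<MXTensor> outputs,
                             OpResource res) {
        return MX_SUCCESS;
      }
     private:
      int count = 0;
    };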
From adb0415f36b2afd6b47a5773721c43a02d51814e Mon Sep 17 00:00:00 2001
From: Ziyi Mu
Date: Mon, 14 Oct 2019 07:38:24 +0000
Subject: [PATCH 097/111] user example using opresource alloc

---
 example/extensions/lib_custom_op/gemm_lib.cc | 27 ++++++++------------
 src/c_api/c_api.cc                           |  8 +++---
 2 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/example/extensions/lib_custom_op/gemm_lib.cc b/example/extensions/lib_custom_op/gemm_lib.cc
index 1452b5289119..5e7de5934af8 100644
--- a/example/extensions/lib_custom_op/gemm_lib.cc
+++ b/example/extensions/lib_custom_op/gemm_lib.cc
@@ -99,17 +99,17 @@ MXReturnValue backward(std::map<std::string, std::string> attrs,
   unsigned n = inputs[1].shape[0];
   unsigned k = inputs[1].shape[1];
   unsigned m = inputs[2].shape[1];
-
-  float *At = new float[k*n];
-  float *Bt = new float[m*k];
+  // allocate temporary workspace memory through resource manager
+  // for multiple arrays better to request a big memory pool
+  void *workspace = res.alloc((k*n + m*k) * sizeof(float));
+  float *At = static_cast<float*>(workspace);
+  float *Bt = static_cast<float*>(workspace) + (k*n);

   transpose(A, At, k, n);
   transpose(B, Bt, m, k);
   gemm(dC, Bt, dA, n, m, k);
   gemm(At, dC, dB, k, n, m);

-  delete[] At;
-  delete[] Bt;
   return MX_SUCCESS;
 }

@@ -146,12 +146,8 @@ MXReturnValue inferShape(std::map<std::string, std::string> attrs,
     std::cout << "Expected 2 inputs to inferShape" << std::endl;
     return MX_FAIL;
   }
-  if (inshapes[0].size() != 2) {
-    std::cout << "Expected 2D for first input to inferShape" << std::endl;
-    return MX_FAIL;
-  }
-  if (inshapes[1].size() != 2) {
-    std::cout << "Expected 2D for second input to inferShape" << std::endl;
+  if (inshapes[0].size() != 2 || inshapes[1].size() != 2) {
+    std::cout << "Expected 2D matrices for both inputs to inferShape" << std::endl;
     return MX_FAIL;
   }

@@ -164,8 +160,7 @@ MXReturnValue inferShape(std::map<std::string, std::string> attrs,
     return MX_FAIL;
   }

-  outshapes[0].push_back(n);
-  outshapes[0].push_back(m);
+  outshapes[0] = {n, m};
   return MX_SUCCESS;
 }

@@ -185,10 +180,8 @@ class MyStatefulGemm : public CustomStatefulOp {
   MXReturnValue Forward(std::vector<MXTensor> inputs,
                         std::vector<MXTensor> outputs,
                         OpResource op_res) {
-    int* p = static_cast<int*>(op_res.alloc(sizeof(int)));
-    *p = ++count;
-    std::cout << "Info: cpu malloc test: keyword + number of forward: " << *p << std::endl;
-
+    ++count;
+    std::cout << "Info: keyword + number of forward: " << count << std::endl;
     std::map<std::string, std::string> attrs;
     return forward(attrs, inputs, outputs, op_res);
   }

diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 8cf40fee1898..482c8dfb8f47 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -409,11 +409,11 @@ int MXLoadLib(const char *path) {
     mshadow::Stream<mxnet::cpu> *cpu_stream = ctx.get_stream<mxnet::cpu>();

     // create lambda that captures stream & resource objects
-    // the memory pointer returned will eventually return to user
+    // this temp workspace holds memory allocated by custom library via OpResource
     auto cpu_alloc = [&](int size) {
-      mshadow::Tensor<mxnet::cpu, 1, char> data =
-        resource.get_space_typed<mxnet::cpu, 1, char>(mshadow::Shape1(size), cpu_stream);
-      return data.dptr_;
+      mshadow::Tensor<mxnet::cpu, 1, char> workspace =
+        resource.get_space_typed<mxnet::cpu, 1, char>(mshadow::Shape1(size), cpu_stream);
+      return workspace.dptr_;
     };
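A side note on the host plumbing just above: a capturing lambda like cpu_alloc cannot decay to the C function pointer type xpu_malloc_t, so MXNet pairs it with a capture-free trampoline and passes the capturing lambda back in through a void*. A self-contained sketch of the idiom (std::malloc stands in for the resource-manager call):

    #include <cstdlib>

    typedef void* (*xpu_malloc_t)(void*, int);

    int demo() {
      int allocated = 0;
      // capturing lambda: tracks state, so it has no function-pointer form
      auto cpu_alloc = [&](int size) -> void* {
        allocated += size;
        return std::malloc(size);  // stand-in for get_space_typed(...)
      };
      typedef decltype(cpu_alloc) alloc_type;
      // capture-free lambda: decays to xpu_malloc_t and forwards the call
      xpu_malloc_t cpu_malloc = [](void* _cpu_alloc, int size) -> void* {
        return (*static_cast<alloc_type*>(_cpu_alloc))(size);
      };
      void* p = cpu_malloc(&cpu_alloc, 64);  // how the library calls back in
      std::free(p);
      return allocated;  // 64
    }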
From dfb59466fdb93e52faaf9e6161994d03f677de57 Mon Sep 17 00:00:00 2001
From: Sam Skalicky
Date: Fri, 18 Oct 2019 01:53:29 +0000
Subject: [PATCH 098/111] added DLTensor into MXTensor

---
 include/mxnet/lib_api.h | 192 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 191 insertions(+), 1 deletion(-)

diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index 983f9ab94a0d..b1fba2c5c3c3 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -38,6 +38,154 @@

 #define MX_LIBRARY_VERSION 1

+/*
+ * Import from DLPack https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
+ */
+#ifndef DLPACK_VERSION
+#ifdef __cplusplus
+#define DLPACK_EXTERN_C extern "C"
+#else
+#define DLPACK_EXTERN_C
+#endif
+
+/*! \brief The current version of dlpack */
+#define DLPACK_VERSION 020
+
+/*! \brief DLPACK_DLL prefix for windows */
+#ifdef _WIN32
+#ifdef DLPACK_EXPORTS
+#define DLPACK_DLL __declspec(dllexport)
+#else
+#define DLPACK_DLL __declspec(dllimport)
+#endif
+#else
+#define DLPACK_DLL
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  /*!
+   * \brief The device type in DLContext.
+   */
+  typedef enum {
+    /*! \brief CPU device */
+    kDLCPU = 1,
+    /*! \brief CUDA GPU device */
+    kDLGPU = 2,
+    /*!
+     * \brief Pinned CUDA GPU device by cudaMallocHost
+     * \note kDLCPUPinned = kDLCPU | kDLGPU
+     */
+    kDLCPUPinned = 3,
+    /*! \brief OpenCL devices. */
+    kDLOpenCL = 4,
+    /*! \brief Vulkan buffer for next generation graphics. */
+    kDLVulkan = 7,
+    /*! \brief Metal for Apple GPU. */
+    kDLMetal = 8,
+    /*! \brief Verilog simulator buffer */
+    kDLVPI = 9,
+    /*! \brief ROCm GPUs for AMD GPUs */
+    kDLROCM = 10,
+    /*!
+     * \brief Reserved extension device type,
+     * used for quickly test extension device
+     * The semantics can differ depending on the implementation.
+     */
+    kDLExtDev = 12,
+  } DLDeviceType;
+
+  /*!
+   * \brief A Device context for Tensor and operator.
+   */
+  typedef struct {
+    /*! \brief The device type used in the device. */
+    DLDeviceType device_type;
+    /*! \brief The device index */
+    int device_id;
+  } DLContext;
+
+  /*!
+   * \brief The type code options DLDataType.
+   */
+  typedef enum {
+    kDLInt = 0U,
+    kDLUInt = 1U,
+    kDLFloat = 2U,
+  } DLDataTypeCode;
+
+  /*!
+   * \brief The data type the tensor can hold.
+   *
+   * Examples
+   * - float: type_code = 2, bits = 32, lanes=1
+   * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
+   * - int8: type_code = 0, bits = 8, lanes=1
+   */
+  typedef struct {
+    /*!
+     * \brief Type code of base types.
+     * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+     * footprint, but the value should be one of DLDataTypeCode enum values.
+     * */
+    uint8_t code;
+    /*!
+     * \brief Number of bits, common choices are 8, 16, 32.
+     */
+    uint8_t bits;
+    /*! \brief Number of lanes in the type, used for vector types. */
+    uint16_t lanes;
+  } DLDataType;
+
+  /*!
+   * \brief Plain C Tensor object, does not manage memory.
+   */
+  typedef struct {
+    /*!
+     * \brief The opaque data pointer points to the allocated data. This will be
+     * CUDA device pointer or cl_mem handle in OpenCL. This pointer is always
+     * aligns to 256 bytes as in CUDA.
+     *
+     * For given DLTensor, the size of memory required to store the contents of
+     * data is calculated as follows:
+     *
+     * \code{.c}
+     * static inline size_t GetDataSize(const DLTensor* t) {
+     *   size_t size = 1;
+     *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
+     *     size *= t->shape[i];
+     *   }
+     *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+     *   return size;
+     * }
+     * \endcode
+     */
+    void* data;
+    /*! \brief The device context of the tensor */
+    DLContext ctx;
+    /*! \brief Number of dimensions */
+    int ndim;
+    /*! \brief The data type of the pointer*/
+    DLDataType dtype;
+    /*! \brief The shape of the tensor */
+    int64_t* shape;
+    /*!
+     * \brief strides of the tensor (in number of elements, not bytes)
+     * can be NULL, indicating tensor is compact and row-majored.
+     */
+    int64_t* strides;
+    /*! \brief The offset in bytes to the beginning pointer to data */
+    uint64_t byte_offset;
+  } DLTensor;
+#ifdef __cplusplus
+}  // DLPACK_EXTERN_C
+#endif
+#endif

 /*!
  * \brief Tensor data type, consistent with mshadow data type
  */
@@ -63,7 +211,46 @@ struct MXTensor {
   MXTensor() : data_ptr(NULL) {}

   MXTensor(void *data_ptr, const std::vector<int64_t> &shape, MXDType dtype)
-    : data_ptr(data_ptr), shape(shape), dtype(dtype) {}
+    : data_ptr(data_ptr), shape(shape), dtype(dtype) {
+    dltensor.data = data_ptr;
+    dltensor.ctx.device_type = kDLCPU;
+    dltensor.ctx.device_id = 0;
+    dltensor.ndim = shape.size();
+    dltensor.shape = const_cast<int64_t*>(shape.data());
+    dltensor.strides = NULL;
+    dltensor.byte_offset = 0;
+    dltensor.dtype.lanes = 1;
+    switch(dtype) {
+      case kFloat32:
+        dltensor.dtype.code = kDLFloat;
+        dltensor.dtype.bits = 32;
+        break;
+      case kFloat64:
+        dltensor.dtype.code = kDLFloat;
+        dltensor.dtype.bits = 64;
+        break;
+      case kFloat16:
+        dltensor.dtype.code = kDLFloat;
+        dltensor.dtype.bits = 16;
+        break;
+      case kUint8:
+        dltensor.dtype.code = kDLUInt;
+        dltensor.dtype.bits = 8;
+        break;
+      case kInt32:
+        dltensor.dtype.code = kDLInt;
+        dltensor.dtype.bits = 32;
+        break;
+      case kInt8:
+        dltensor.dtype.code = kDLInt;
+        dltensor.dtype.bits = 8;
+        break;
+      case kInt64:
+        dltensor.dtype.code = kDLInt;
+        dltensor.dtype.bits = 64;
+        break;
+    }
+  }

   /*! \brief helper function to cast data pointer */
   template<typename data_type>
@@ -89,6 +276,9 @@ struct MXTensor {

   // type can only be MXDType enum types
   MXDType dtype;
+
+  /*! \brief corresponding DLTensor of this MXTensor */
+  DLTensor dltensor;
 };

From 5146fd5a9639e95598cb4ce49a53da02d58b7379 Mon Sep 17 00:00:00 2001
From: Sam Skalicky
Date: Fri, 18 Oct 2019 02:10:26 +0000
Subject: [PATCH 099/111] fixed whitespace

---
 include/mxnet/lib_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index b1fba2c5c3c3..b110cb123c2a 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -220,7 +220,7 @@ struct MXTensor {
     dltensor.strides = NULL;
     dltensor.byte_offset = 0;
     dltensor.dtype.lanes = 1;
-    switch(dtype) {
+    switch (dtype) {
       case kFloat32:
         dltensor.dtype.code = kDLFloat;
         dltensor.dtype.bits = 32;
         break;
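With the DLTensor member in place, library routines can be written against the DLPack struct alone. A hedged sketch (the helper name is invented; it mirrors the GetDataSize example quoted from dlpack.h above):

    #include <cstddef>

    // Bytes needed for the payload of a compact DLTensor: the product of
    // the shape times (bits * lanes) per element, rounded up to bytes.
    static size_t dltensor_bytes(const DLTensor* t) {
      size_t size = 1;
      for (int i = 0; i < t->ndim; ++i)
        size *= t->shape[i];
      size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
      return size;
    }

    // Inside an operator callback one can then branch on the DLPack dtype:
    //   if (inputs[0].dltensor.dtype.code == kDLFloat &&
    //       inputs[0].dltensor.dtype.bits == 32)
    //     float* A = static_cast<float*>(inputs[0].dltensor.data);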
From 1b9fee289bc194c610e164a8587e1b079bf71a04 Mon Sep 17 00:00:00 2001
From: Sam Skalicky
Date: Fri, 18 Oct 2019 03:09:15 +0000
Subject: [PATCH 100/111] added error check when DLTensor does not support MXNet data type

---
 include/mxnet/lib_api.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index b110cb123c2a..dc8aa3fbfea6 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -249,6 +249,10 @@ struct MXTensor {
         dltensor.dtype.code = kDLInt;
         dltensor.dtype.bits = 64;
         break;
+      default:
+        dltensor.dtype.code = 0;
+        dltensor.dtype.bits = 0;
+        std::cout << "Error! Invalid dtype flag: " << dtype " when constructing MXTensor" << std::endl;
     }
   }

From 5761891d039c49e34ab1da86ed802239d83e545d Mon Sep 17 00:00:00 2001
From: Sam Skalicky
Date: Fri, 18 Oct 2019 04:46:17 +0000
Subject: [PATCH 101/111] changed to throw runtime exception

---
 include/mxnet/lib_api.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index dc8aa3fbfea6..cf04e938d762 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -35,6 +35,7 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <exception>

 #define MX_LIBRARY_VERSION 1

@@ -252,7 +253,9 @@ struct MXTensor {
       default:
         dltensor.dtype.code = 0;
         dltensor.dtype.bits = 0;
-        std::cout << "Error! Invalid dtype flag: " << dtype " when constructing MXTensor" << std::endl;
+        throw std::runtime_error("Error! Invalid dtype flag: " +
+                                 std::to_string(static_cast<int>(dtype)) +
+                                 " when constructing MXTensor");
     }
   }

From ef840b4a59f860baebb24b3b5970858bf422e635 Mon Sep 17 00:00:00 2001
From: Sam Skalicky
Date: Fri, 18 Oct 2019 05:12:07 +0000
Subject: [PATCH 102/111] changed include to stdexcept

---
 include/mxnet/lib_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index cf04e938d762..6045653eb4ff 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -35,7 +35,7 @@
 #include <map>
 #include <string>
 #include <vector>
-#include <exception>
+#include <stdexcept>

From bba61b36f0dcfa6e4c3b334903843439c30ec498 Mon Sep 17 00:00:00 2001
From: JackieWu
Date: Fri, 18 Oct 2019 13:44:26 +0800
Subject: [PATCH 103/111] retrigger CI

From 53d18ecfdcf916f5c43ff9e9a468cd6bc8f2cc75 Mon Sep 17 00:00:00 2001
From: Sam Skalicky
Date: Fri, 18 Oct 2019 07:38:18 +0000
Subject: [PATCH 104/111] empty commit

From 141328f133d34bbf42595c86e942c059f5a3d58a Mon Sep 17 00:00:00 2001
From: Sam Skalicky
Date: Fri, 18 Oct 2019 09:41:55 +0000
Subject: [PATCH 105/111] empty commit

From ed8ac16c95e4d67ed55a41fb71f6e921fface159 Mon Sep 17 00:00:00 2001
From: Ziyi Mu
Date: Wed, 23 Oct 2019 05:30:07 +0000
Subject: [PATCH 106/111] remove merge conflict

---
 tests/python/gpu/test_operator_gpu.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 08289a3c18aa..22510f5e68b7 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -48,7 +48,6 @@
 from test_contrib_operator import test_multibox_target_op
 from test_tvm_op import *
 from test_extensions import *
-from test_library_loading import *
 from test_contrib_optimizer import test_adamw

 set_default_context(mx.gpu(0))
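Between PATCH 101 above and PATCH 107 below, constructing an MXTensor with an unknown dtype throws. A hedged sketch of a caller-side guard (the helper name is invented; after PATCH 107 the same check runs inside setDLTensor() instead):

    #include <iostream>
    #include <stdexcept>
    #include <vector>

    // Assumes lib_api.h is included. Returns false instead of propagating
    // the std::runtime_error thrown for an unsupported MXDType flag.
    bool tryMakeTensor(void* data, const std::vector<int64_t>& shape, MXDType dtype) {
      try {
        MXTensor t(data, shape, dtype);  // throws on invalid dtype
        return true;
      } catch (const std::runtime_error& e) {
        std::cerr << e.what() << std::endl;  // "Error! Invalid dtype flag: ..."
        return false;
      }
    }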
From 56b0e28cee9416a40fe83ab758767caecedbc95a Mon Sep 17 00:00:00 2001
From: Ziyi Mu
Date: Wed, 23 Oct 2019 06:52:28 +0000
Subject: [PATCH 107/111] add setdltensor for easy use and add docs

---
 example/extensions/lib_custom_op/gemm_lib.cc |  2 ++
 include/mxnet/lib_api.h                      | 12 ++++++++++--
 src/c_api/c_api.cc                           |  2 +-
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/example/extensions/lib_custom_op/gemm_lib.cc b/example/extensions/lib_custom_op/gemm_lib.cc
index 5e7de5934af8..3835207e2a16 100644
--- a/example/extensions/lib_custom_op/gemm_lib.cc
+++ b/example/extensions/lib_custom_op/gemm_lib.cc
@@ -61,6 +61,8 @@ MXReturnValue forward(std::map<std::string, std::string> attrs,
   if (inputs[0].dtype == kFloat32) {
     typedef float DType;
     // extract data pointers from tensors
+    // if using dltensor repr, below lines can be changed to something like
+    // DType* A = reinterpret_cast<DType*>(inputs[0].dltensor.data);
     DType* A = inputs[0].data<DType>();
     DType* B = inputs[1].data<DType>();
     DType* C = outputs[0].data<DType>();

diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h
index 6045653eb4ff..290a63518373 100644
--- a/include/mxnet/lib_api.h
+++ b/include/mxnet/lib_api.h
@@ -212,7 +212,10 @@ struct MXTensor {
   MXTensor() : data_ptr(NULL) {}

   MXTensor(void *data_ptr, const std::vector<int64_t> &shape, MXDType dtype)
-    : data_ptr(data_ptr), shape(shape), dtype(dtype) {
+    : data_ptr(data_ptr), shape(shape), dtype(dtype) {}
+
+  /*! \brief populate DLTensor fields */
+  void setDLTensor() {
     dltensor.data = data_ptr;
     dltensor.ctx.device_type = kDLCPU;
     dltensor.ctx.device_id = 0;
@@ -284,7 +287,8 @@ struct MXTensor {
   // type can only be MXDType enum types
   MXDType dtype;

-  /*! \brief corresponding DLTensor of this MXTensor */
+  // corresponding DLTensor repr of MXTensor
+  // easy way to reuse functions taking DLTensor
   DLTensor dltensor;
 };

@@ -883,6 +887,7 @@ extern "C" {
       for (int j = 0; j < indims[i]; j++) {
         inputs[i].shape.push_back(inshapes[i][j]);
       }
+      inputs[i].setDLTensor();
     }

     // create a vector of tensors for outputs
@@ -893,6 +898,7 @@ extern "C" {
       for (int j = 0; j < outdims[i]; j++) {
         outputs[i].shape.push_back(outshapes[i][j]);
       }
+      outputs[i].setDLTensor();
     }

@@ -972,6 +978,7 @@ extern "C" {
       for (int j = 0; j < indims[i]; j++) {
         inputs[i].shape.push_back(inshapes[i][j]);
       }
+      inputs[i].setDLTensor();
     }

     // create a vector of tensors for outputs
@@ -982,6 +989,7 @@ extern "C" {
       for (int j = 0; j < outdims[i]; j++) {
         outputs[i].shape.push_back(outshapes[i][j]);
       }
+      outputs[i].setDLTensor();
     }
     OpResource res(cpu_malloc, cpu_alloc);
     CustomStatefulOp* op_ptr = reinterpret_cast<CustomStatefulOp*>(state_op);

diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 482c8dfb8f47..0efb0d48fabb 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -584,7 +584,7 @@ int MXLoadLib(const char *path) {
     mshadow::Stream<mxnet::cpu> *cpu_stream = ctx.get_stream<mxnet::cpu>();

     // create lambda that captures stream & resource objects
-    // the memory pointer returned will eventually return to user
+    // this temp workspace holds memory allocated by custom library via OpResource
     auto cpu_alloc = [&](int size) {
       mshadow::Tensor<mxnet::cpu, 1, char> data =
         resource.get_space_typed<mxnet::cpu, 1, char>(mshadow::Shape1(size), cpu_stream);

From 50c8aeadbe08214e02b0979381cb01ebb9f992a1 Mon Sep 17 00:00:00 2001
From: JackieWu
Date: Wed, 27 Nov 2019 07:38:08 +0800
Subject: [PATCH 108/111] CI

From 34a9ee9462c48529819a5b4ae38196f293715222 Mon Sep 17 00:00:00 2001
From: JackieWu
Date: Thu, 28 Nov 2019 14:13:33 +0800
Subject: [PATCH 109/111] re-trigger CI

From 9910c397081ff883b4959b0e58390e02e8aab39f Mon Sep 17 00:00:00 2001
From: JackieWu
Date: Thu, 5 Dec 2019 14:26:16 +0800
Subject: [PATCH 110/111] ci

From 5fd43140331c6d6c4f9d646853db2a55103606d2 Mon Sep 17 00:00:00 2001
From: JackieWu
Date: Fri, 6 Dec 2019 10:24:24 +0800
Subject: [PATCH 111/111] ci
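To tie PATCH 107 together: the intended round trip is that the C entry points fill data_ptr/shape/dtype, call setDLTensor(), and library code then reads either the typed accessor or the DLPack view. A compact sketch (values illustrative; assumes lib_api.h is included):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    void demoSetDLTensor() {
      float buf[6] = {0, 1, 2, 3, 4, 5};
      std::vector<int64_t> shape = {2, 3};

      MXTensor t(buf, shape, kFloat32);
      t.setDLTensor();  // populate the embedded DLPack descriptor

      // the typed accessor and the DLPack view alias the same memory
      float* a = t.data<float>();
      float* b = reinterpret_cast<float*>(t.dltensor.data);
      assert(a == b);
      assert(t.dltensor.ndim == 2 && t.dltensor.dtype.bits == 32);
    }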