Merge branch 'master' into row_stack

apache · Dec 27, 2019 · 660421e · 660421e
2 parents e48b8c8 + 1cfaf3c
commit 660421e
Show file tree

Hide file tree

Showing 51 changed files with 3,502 additions and 97 deletions.
diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn
diff --git a/cd/python/pypi/Jenkins_pipeline.groovy b/cd/python/pypi/Jenkins_pipeline.groovy
@@ -27,7 +27,7 @@
 // This is a temporary solution until we are confident with the packages generated by CI
 // This should be removed in the not too distant future.
 // We only skip the publish step so we can still QA the other variants.
-pypi_releases = ["cu92", "cu92mkl"]
+pypi_releases = []
 
 def get_pipeline(mxnet_variant) {
   def node_type = mxnet_variant.startsWith('cu') ? NODE_LINUX_GPU : NODE_LINUX_CPU
@@ -72,6 +72,7 @@ def push(mxnet_variant) {
     } else {
       echo "Temporarily skipping publishing PyPI package for '${mxnet_variant}'."
     }
+    sh "./ci/docker/runtime_functions.sh cd_s3_publish"
   }
 }
 

diff --git a/cd/python/pypi/pypi_publish.py b/cd/python/pypi/pypi_publish.py
@@ -35,10 +35,8 @@ def post_wheel(path):
     logging.info('Posting {} to PyPI'.format(path))
     pypi_credentials = get_secret()
 
-    cmd = 'python3 -m twine upload --username {} --password {} {}'.format(
-        pypi_credentials['username'],
-        pypi_credentials['password'],
-        path)
+    cmd = 'python3 -m twine upload {}'.format(path)
+    version = os.path.basename(path).split('-')[1]
 
     # The PyPI credentials for DEV has username set to 'skipPublish'
     # This way we do not attempt to publish the PyPI package
@@ -47,14 +45,15 @@ def post_wheel(path):
         print('In DEV account, skipping publish')
         print('Would have run: {}'.format(cmd))
         return 0
-    else:
+    elif any(test_version_mark in version for test_version_mark in ['a', 'b', 'dev']):
         print('Skipping publishing nightly builds to Pypi.')
         print('See https://github.com/pypa/pypi-support/issues/50 for details')
         return 0
-
-        # DO NOT PRINT CMD IN THIS BLOCK, includes password
-        p = subprocess.run(cmd.split(' '),
-                        stdout=subprocess.PIPE)
+    else:
+        env = os.environ.copy()
+        env['TWINE_USERNAME'] = pypi_credentials['username']
+        env['TWINE_PASSWORD'] = pypi_credentials['password']
+        p = subprocess.run(cmd.split(' '), stdout=subprocess.PIPE, env=env)
         logging.info(p.stdout)
         return p.returncode
 
@@ -85,7 +84,7 @@ def get_secret():
             raise e
     else:
         return json.loads(get_secret_value_response['SecretString'])
-        
-            
+
+
 if __name__ == '__main__':
     sys.exit(post_wheel(sys.argv[1]))
diff --git a/ci/docker/install/requirements b/ci/docker/install/requirements
@@ -26,8 +26,8 @@ h5py==2.8.0rc1
 mock==2.0.0
 nose==1.3.7
 nose-timer==0.7.3
-numpy>1.16.0,<2.0.0
+numpy>1.16.0,<1.18.0
 pylint==2.3.1; python_version >= '3.0'
 requests<2.19.0,>=2.18.4
-scipy==1.0.1
+scipy==1.2.1
 six==1.11.0
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
@@ -2065,6 +2065,15 @@ cd_pypi_publish() {
     ./cd/python/pypi/pypi_publish.py `readlink -f wheel_build/dist/*.whl`
 }
 
+cd_s3_publish() {
+    set -ex
+    pip3 install --user awscli
+    filepath=$(readlink -f wheel_build/dist/*.whl)
+    filename=$(basename $file_path)
+    variant=$(echo $filename | cut -d'-' -f1 | cut -d'_' -f2 -s)
+    aws s3 cp --grants read=uri=http://acs.amazonaws.com/groups/global/AllUsers,full=id=43f628fab72838a4f0b929d7f1993b14411f4b0294b011261bc6bd3e950a6822 s3://apache-mxnet/dist/${variant}/${filename}
+}
+
 build_static_scala_mkl() {
     set -ex
     pushd .

diff --git a/example/neural_collaborative_filtering/README.md b/example/neural_collaborative_filtering/README.md
@@ -29,15 +29,6 @@ Author: Dr. Xiangnan He (http://www.comp.nus.edu.sg/~xiangnan/)
 
 Code Reference: https://github.com/hexiangnan/neural_collaborative_filtering
 
-## Environment Settings
-We use MXnet with MKL-DNN as the backend. 
-- MXNet version:  '1.5.1'
-
-## Install
-```
-pip install -r requirements.txt
-```
-
 ## Dataset
 
 We provide the processed datasets on [Google Drive](https://drive.google.com/drive/folders/1qACR_Zhc2O2W0RrazzcepM2vJeh0MMdO?usp=sharing): MovieLens 20 Million (ml-20m), you can download directly or 
@@ -66,7 +57,9 @@ We provide the pretrained ml-20m model on [Google Drive](https://drive.google.co
 |dtype|HR@10|NDCG@10|
 |:---:|:--:|:--:|
 |float32|0.6393|0.3849|
-|int8|0.6366|0.3824|
+|float32 opt|0.6393|0.3849|
+|int8|0.6395|0.3852|
+|int8 opt|0.6396|0.3852|
 
 ## Training
 
@@ -75,27 +68,46 @@ We provide the pretrained ml-20m model on [Google Drive](https://drive.google.co
 python train.py # --gpu=0
 ```
 
+## Model Optimizer
+
+```
+# optimize model
+python model_optimizer.py
+```
+
 ## Calibration
 
 ```
 # neumf calibration on ml-20m dataset
 python ncf.py --prefix=./model/ml-20m/neumf --calibration
+# optimized neumf calibration on ml-20m dataset
+python ncf.py --prefix=./model/ml-20m/neumf-opt --calibration
 ```
 
 ## Evaluation
 
 ```
 # neumf float32 inference on ml-20m dataset
 python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf
+# optimized neumf float32 inference on ml-20m dataset
+python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf-opt
 # neumf int8 inference on ml-20m dataset
 python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf-quantized
+# optimized neumf int8 inference on ml-20m dataset
+python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf-opt-quantized
 ```
 
 ## Benchmark
 
 ```
+usage: bash ./benchmark.sh [[[-p prefix ] [-e epoch] [-d dataset] [-b batch_size] [-i instance] [-c cores/instance]] | [-h]]
+
 # neumf float32 benchmark on ml-20m dataset
-python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf --benchmark
+bash ./benchmark.sh -p model/ml-20m/neumf
+# optimized neumf float32 benchmark on ml-20m dataset
+bash ./benchmark.sh -p model/ml-20m/neumf-opt
 # neumf int8 benchmark on ml-20m dataset
-python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf-quantized --benchmark
+bash ./benchmark.sh -p model/ml-20m/neumf-quantized
+# optimized neumf int8 benchmark on ml-20m dataset
+bash ./benchmark.sh -p model/ml-20m/neumf-opt-quantized
 ```
diff --git a/example/neural_collaborative_filtering/benchmark.sh b/example/neural_collaborative_filtering/benchmark.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+usage()
+{
+    echo "usage: bash ./benchmark.sh [[[-p prefix ] [-e epoch] [-d dataset] [-b batch_size] [-i instance] [-c cores/instance]] | [-h]]"
+}
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --prefix | -p)
+      shift
+      PREFIX=$1
+      ;;
+    --epoch | -e)
+      shift
+      EPOCH=$1
+      ;;
+    --dataset | -d)
+      shift
+      DATASET=$1
+      ;;
+    --batch-size | -b)
+      shift
+      BS=$1
+      ;;
+    --instance | -i)
+      shift
+      INS=$1
+      ;;
+    --core | -c)
+      shift
+      CORES=$1
+      ;;
+    --help | -h)
+      usage
+      exit 1
+      ;;
+    *)
+      usage
+      exit 1
+  esac
+  shift
+done
+
+NUM_SOCKET=`lscpu | grep 'Socket(s)' | awk '{print $NF}'`
+NUM_NUMA_NODE=`lscpu | grep 'NUMA node(s)' | awk '{print $NF}'`
+CORES_PER_SOCKET=`lscpu | grep 'Core(s) per socket' | awk '{print $NF}'`
+NUM_CORES=$((CORES_PER_SOCKET * NUM_SOCKET))
+CORES_PER_NUMA=$((NUM_CORES / NUM_NUMA_NODE))
+echo "target machine has $NUM_CORES physical core(s) on $NUM_NUMA_NODE numa nodes of $NUM_SOCKET socket(s)."
+
+# The model prefix is mandatory; abort with a failing status when it is absent
+# so that callers can detect the error.
+if [ -z "$PREFIX" ]; then
+  echo "Error: Need a model prefix." >&2
+  exit 1
+fi
+# Every other option falls back to a default when it was not given.
+if [ -z "$EPOCH" ]; then
+  echo "Default: set epoch of model parameters to 7."
+  EPOCH=7
+fi
+if [ -z "$DATASET" ]; then
+  echo "Default: set dataset to ml-20m."
+  DATASET='ml-20m'
+fi
+if [ -z "$INS" ]; then
+  echo "Default: launch one instance per physical core."
+  INS=$NUM_CORES
+fi
+if [ -z "$CORES" ]; then
+  # Integer division: the physical cores are split evenly between instances.
+  echo "Default: divide full physical cores."
+  CORES=$((NUM_CORES / INS))
+fi
+if [ -z "$BS" ]; then
+  echo "Default: set batch size to 700."
+  BS=700
+fi
+
+echo "  cores/instance: $CORES"
+echo "  total instances: $INS"
+echo "  batch size: $BS"
+echo ""
+
+# Drop logs from a previous run; -f keeps a first run (no logs yet) quiet.
+rm -f NCF_*.log
+
+# Start INS background instances. Instance i is pinned to the CORES consecutive
+# cores a..b and to the memory of the NUMA node that core b belongs to.
+for ((i = 0; i < INS; i++)); do
+  a=$((i * CORES))
+  b=$((a + CORES - 1))
+  memid=$((b / CORES_PER_NUMA))
+  LOG=NCF_$i.log
+  echo "  $i instance use $a-$b cores with $LOG"
+  KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 \
+  OMP_NUM_THREADS=$CORES \
+  numactl --physcpubind="$a-$b" --membind="$memid" python ncf.py --batch-size="$BS" --dataset="$DATASET" --epoch="$EPOCH" --benchmark --prefix="$PREFIX" 2>&1 | tee "$LOG" &
+done
+# Block until every background instance has finished.
+wait
+
+# Add up the second-to-last field of every log line that contains "speed".
+grep speed NCF_*.log | awk '{ sum += $(NF-1) }; END { print "Total Performance is " sum " samples/sec"}'
diff --git a/example/neural_collaborative_filtering/convert.py b/example/neural_collaborative_filtering/convert.py
@@ -38,7 +38,7 @@ def parse_args():
     parser = ArgumentParser()
     parser.add_argument('--dataset', nargs='?', default='ml-20m', choices=['ml-1m', 'ml-20m'],
                         help='The dataset name, temporary support ml-1m and ml-20m.')
-    parser.add_argument('path', type=str, default = './data/',
+    parser.add_argument('--path', type=str, default = './data/',
                         help='Path to reviews CSV file from MovieLens')
     parser.add_argument('-n', '--negatives', type=int, default=999,
                         help='Number of negative samples for each positive'

diff --git a/example/neural_collaborative_filtering/core/model.py b/example/neural_collaborative_filtering/core/model.py
@@ -37,6 +37,27 @@ def _init_weight(self, _, arr):
         limit = np.sqrt(3. / self._fan_in)
         mx.random.uniform(-limit, limit, out=arr)
 
+# only for inference model optimize
+def mlp_opt(user, item, factor_size, model_layers, max_user, max_item):
+    user_weight = mx.sym.Variable('fused_mlp_user_weight', init=mx.init.Normal(0.01))
+    item_weight = mx.sym.Variable('fused_mlp_item_weight', init=mx.init.Normal(0.01))
+    embed_user = mx.sym.Embedding(data=user, weight=user_weight, input_dim=max_user,
+                                  output_dim=factor_size * 2, name='fused_embed_user'+str(factor_size))
+    embed_item = mx.sym.Embedding(data=item, weight=item_weight, input_dim=max_item,
+                                  output_dim=factor_size * 2, name='fused_embed_item'+str(factor_size))
+    pre_gemm_concat = embed_user + embed_item
+
+    for i in range(1, len(model_layers)):
+        if i==1:
+            pre_gemm_concat = mx.sym.Activation(data=pre_gemm_concat, act_type='relu', name='act_'+str(i-1))
+            continue
+        else:
+            mlp_weight_init = golorot_uniform(model_layers[i-1], model_layers[i])
+        mlp_weight = mx.sym.Variable('fc_{}_weight'.format(i-1), init=mlp_weight_init)
+        pre_gemm_concat = mx.sym.FullyConnected(data=pre_gemm_concat, weight=mlp_weight, num_hidden=model_layers[i], name='fc_'+str(i-1))
+        pre_gemm_concat = mx.sym.Activation(data=pre_gemm_concat, act_type='relu', name='act_'+str(i-1))
+
+    return pre_gemm_concat
 
 def mlp(user, item, factor_size, model_layers, max_user, max_item):
     user_weight = mx.sym.Variable('mlp_user_weight', init=mx.init.Normal(0.01))
@@ -47,14 +68,11 @@ def mlp(user, item, factor_size, model_layers, max_user, max_item):
                                   output_dim=factor_size, name='embed_item'+str(factor_size))
     pre_gemm_concat = mx.sym.concat(embed_user, embed_item, dim=1, name='pre_gemm_concat')
 
-    for i, layer in enumerate(model_layers):
-        if i==0:
-            mlp_weight_init = golorot_uniform(2 * factor_size, model_layers[i])
-        else:
-            mlp_weight_init = golorot_uniform(model_layers[i-1], model_layers[i])
-        mlp_weight = mx.sym.Variable('fc_{}_weight'.format(i), init=mlp_weight_init)
-        pre_gemm_concat = mx.sym.FullyConnected(data=pre_gemm_concat, weight=mlp_weight, num_hidden=layer, name='fc_'+str(i))
-        pre_gemm_concat = mx.sym.Activation(data=pre_gemm_concat, act_type='relu', name='act_'+str(i))
+    for i in range(1, len(model_layers)):
+        mlp_weight_init = golorot_uniform(model_layers[i-1], model_layers[i])
+        mlp_weight = mx.sym.Variable('fc_{}_weight'.format(i-1), init=mlp_weight_init)
+        pre_gemm_concat = mx.sym.FullyConnected(data=pre_gemm_concat, weight=mlp_weight, num_hidden=model_layers[i], name='fc_'+str(i-1))
+        pre_gemm_concat = mx.sym.Activation(data=pre_gemm_concat, act_type='relu', name='act_'+str(i-1))
 
     return pre_gemm_concat
 
@@ -70,24 +88,34 @@ def gmf(user, item, factor_size, max_user, max_item):
     return pred
 
 def get_model(model_type='neumf', factor_size_mlp=128, factor_size_gmf=64,
-              model_layers=[256, 128, 64], num_hidden=1, 
-              max_user=138493, max_item=26744):
+              model_layers=[256, 256, 128, 64], num_hidden=1, 
+              max_user=138493, max_item=26744, opt=False):
     # input
     user = mx.sym.Variable('user')
     item = mx.sym.Variable('item')
 
     if model_type == 'mlp':
-        net = mlp(user=user, item=item,
-                  factor_size=factor_size_mlp, model_layers=model_layers,
-                  max_user=max_user, max_item=max_item)
+        if opt:
+            net = mlp_opt(user=user, item=item,
+                         factor_size=factor_size_mlp, model_layers=model_layers,
+                         max_user=max_user, max_item=max_item)
+        else:
+            net = mlp(user=user, item=item,
+                      factor_size=factor_size_mlp, model_layers=model_layers,
+                      max_user=max_user, max_item=max_item)
     elif model_type == 'gmf':
         net = gmf(user=user, item=item,
                   factor_size=factor_size_gmf,
                   max_user=max_user, max_item=max_item)
     elif model_type == 'neumf':
-        net_mlp = mlp(user=user, item=item,
-                      factor_size=factor_size_mlp, model_layers=model_layers,
-                      max_user=max_user, max_item=max_item)
+        if opt:
+            net_mlp = mlp_opt(user=user, item=item,
+                              factor_size=factor_size_mlp, model_layers=model_layers,
+                              max_user=max_user, max_item=max_item)
+        else:
+            net_mlp = mlp(user=user, item=item,
+                          factor_size=factor_size_mlp, model_layers=model_layers,
+                          max_user=max_user, max_item=max_item)
         net_gmf = gmf(user=user, item=item,
                       factor_size=factor_size_gmf,
                       max_user=max_user, max_item=max_item)