Build libtorch with new gcc ABI (pytorch#335)

* [WIP] * Move zip openssl installation out of build_common.sh * fix libgomp * try to fix * improve build_cpu.sh as well * combine GLIBCXX_USE_CXX11_ABI setting * add comment * fix _GLIBCXX_USE_CXX11_ABI flag passing * fix PWD * further test comment * fix comments * fix abi check * update comments * improve DEPS_LIST * remove CXX_ABI_VARIANT * add identifier to new ABI binary zip * only check gcc ABI for libtorch builds * skip ABI test for devtoolset7 * DEBUG: CUDA version * improve CUDA version checking * better message * Switch /usr/local/cuda to point to correct CUDA version * fix libgomp path for CUDA * fix abi symbol checking * check number of correct ABI symbols as well * no need to check c10 library for ABI symbols * improve LIBTORCH_NAMESPACE_LIST * run ABI check on all binaries * DEBUG * DEBUG python version problem * Revert "DEBUG" This reverts commit df05ed1. * Revert "DEBUG python version problem" This reverts commit e13bb2b. * add yum install -q -y zip openssl back
atalman · Aug 14, 2019 · d7348b4 · d7348b4
1 parent b9ac256
commit d7348b4
Show file tree

Hide file tree

Showing 7 changed files with 172 additions and 73 deletions.
diff --git a/check_binary.sh b/check_binary.sh
@@ -24,7 +24,8 @@ set -eux -o pipefail
 # The install root depends on both the package type and the os
 # All MacOS packages use conda, even for the wheel packages.
 if [[ "$PACKAGE_TYPE" == libtorch ]]; then
-  install_root="$pwd"
+  # NOTE: Only $PWD works on both CentOS and Ubuntu
+  install_root="$PWD"
 else
   py_dot="${DESIRED_PYTHON:0:3}"
   install_root="$(dirname $(which python))/../lib/python${py_dot}/site-packages/torch/"
@@ -34,24 +35,27 @@ fi
 ###############################################################################
 # Check GCC ABI
 ###############################################################################
+
+# NOTE [ Building libtorch with old vs. new gcc ABI ]
+#
+# Packages built with one version of ABI could not be linked against by client
+# C++ libraries that were compiled using the other version of ABI. Since both
+# gcc ABIs are still common in the wild, we need to support both ABIs. Currently:
+#
+# - All the nightlies built on CentOS 7 + devtoolset7 use the old gcc ABI.
+# - All the nightlies built on Ubuntu 16.04 + gcc 5.4 use the new gcc ABI.
+
 echo "Checking that the gcc ABI is what we expect"
 if [[ "$(uname)" != 'Darwin' ]]; then
   function is_expected() {
-    # This commented out logic is what you'd expect if 'devtoolset7' actually
-    # built with the new GCC ABI, but it doesn't; it always builds with ABI=0.
-    # When a compiler is added that does build with new ABI, then replace
-    # devtoolset7 (and the DESIRED_DEVTOOLSET variable) with your new compiler
-    #if [[ "$DESIRED_DEVTOOLSET" == 'devtoolset7' ]]; then
-    #  if [[ "$1" -gt 0 || "$1" == "ON" ]]; then
-    #    echo 1
-    #  fi
-    #else
-    #  if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then
-    #    echo 1
-    #  fi
-    #fi
-    if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then
-      echo 1
+    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+      if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
+        echo 1
+      fi
+    else
+      if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then
+        echo 1
+      fi
     fi
   }
 
@@ -90,26 +94,75 @@ if [[ "$(uname)" != 'Darwin' ]]; then
   fi
 
   # We also check that there are [not] cxx11 symbols in libtorch
-  # TODO this doesn't catch everything. Even when building with the old ABI
-  # there are 44 symbols in the new ABI in the libtorch library, making this
-  # check return true. This should actually check that the number of new ABI
-  # symbols is sufficiently large.
-  # Also, this is wrong on the old ABI, since there are some cxx11 symbols with
-  # devtoolset7.
-  #echo "Checking that symbols in libtorch.so have the right gcc abi"
-  #libtorch="${install_root}/lib/libtorch.so"
-  #cxx11_symbols="$(nm "$libtorch" | c++filt | grep __cxx11 | wc -l)" || true
-  #if [[ "$(is_expected $cxx11_symbols)" != 1 ]]; then
-  #  if [[ "$cxx11_symbols" == 0 ]]; then
-  #    echo "No cxx11 symbols found, but there should be."
-  #  else
-  #    echo "Found cxx11 symbols but there shouldn't be. Dumping symbols"
-  #    nm "$libtorch" | c++filt | grep __cxx11
-  #  fi
-  #  exit 1
-  #else
-  #  echo "cxx11 symbols seem to be in order"
-  #fi
+  #
+  # To check whether it is using cxx11 ABI, check non-existence of symbol:
+  PRE_CXX11_SYMBOLS=(
+    "std::basic_string"
+    "std::list"
+  )
+  # To check whether it is using pre-cxx11 ABI, check non-existence of symbol:
+  CXX11_SYMBOLS=(
+    "std::__cxx11::basic_string"
+    "std::__cxx11::list"
+  )
+  # NOTE: Checking the above symbols in all namespaces doesn't work, because
+  # devtoolset7 always produces some cxx11 symbols even if we build with old ABI,
+  # and CuDNN always has pre-cxx11 symbols even if we build with new ABI using gcc 5.4.
+  # Instead, we *only* check the above symbols in the following namespaces:
+  LIBTORCH_NAMESPACE_LIST=(
+    "c10::"
+    "at::"
+    "caffe2::"
+    "torch::"
+  )
+  echo "Checking that symbols in libtorch.so have the right gcc abi"
+  grep_symbols () {
+    symbols=("$@")
+    for namespace in "${LIBTORCH_NAMESPACE_LIST[@]}"
+    do
+      for symbol in "${symbols[@]}"
+      do
+        nm "$lib" | c++filt | grep " $namespace".*$symbol
+      done
+    done
+  }
+  check_lib_symbols_for_abi_correctness () {
+    lib=$1
+    echo "lib: " $lib
+    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+      num_pre_cxx11_symbols=$(grep_symbols "${PRE_CXX11_SYMBOLS[@]}" | wc -l) || true
+      echo "num_pre_cxx11_symbols: " $num_pre_cxx11_symbols
+      if [[ "$num_pre_cxx11_symbols" -gt 0 ]]; then
+        echo "Found pre-cxx11 symbols but there shouldn't be. Dumping symbols"
+        grep_symbols "${PRE_CXX11_SYMBOLS[@]}"
+        exit 1
+      fi
+      num_cxx11_symbols=$(grep_symbols "${CXX11_SYMBOLS[@]}" | wc -l) || true
+      echo "num_cxx11_symbols: " $num_cxx11_symbols
+      if [[ "$num_cxx11_symbols" -lt 1000 ]]; then
+        echo "Didn't find enough cxx11 symbols. Aborting."
+        exit 1
+      fi
+    else
+      num_cxx11_symbols=$(grep_symbols "${CXX11_SYMBOLS[@]}" | wc -l) || true
+      echo "num_cxx11_symbols: " $num_cxx11_symbols
+      if [[ "$num_cxx11_symbols" -gt 0 ]]; then
+        echo "Found cxx11 symbols but there shouldn't be. Dumping symbols"
+        grep_symbols "${CXX11_SYMBOLS[@]}"
+        exit 1
+      fi
+      num_pre_cxx11_symbols=$(grep_symbols "${PRE_CXX11_SYMBOLS[@]}" | wc -l) || true
+      echo "num_pre_cxx11_symbols: " $num_pre_cxx11_symbols
+      if [[ "$num_pre_cxx11_symbols" -lt 1000 ]]; then
+        echo "Didn't find enough pre-cxx11 symbols. Aborting."
+        exit 1
+      fi
+    fi
+  }
+  libtorch="${install_root}/lib/libtorch.so"
+  check_lib_symbols_for_abi_correctness $libtorch
+
+  echo "cxx11 symbols seem to be in order"
 fi # if on Darwin
 
 ###############################################################################
@@ -160,6 +213,7 @@ fi
 ###############################################################################
 if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
   # For libtorch testing is done. All further tests require Python
+  # TODO: We should run those further tests for libtorch as well
   exit 0
 fi
 python -c 'import torch'

diff --git a/conda/.DS_Store b/conda/.DS_Store
diff --git a/manywheel/build.sh b/manywheel/build.sh
@@ -22,8 +22,22 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
 fi
 
 # Determine CUDA version and architectures to build for
-CUDA_VERSION=$(nvcc --version|tail -n1|cut -f5 -d" "|cut -f1 -d",")
-echo "CUDA $CUDA_VERSION Detected"
+#
+# NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
+# because in some cases a single Docker image can have multiple CUDA versions
+# on it, and `nvcc --version` might not show the CUDA version we want.
+if [[ -n "$DESIRED_CUDA" ]]; then
+    # cu90, cu92, cu100, cu101
+    if [[ ${#DESIRED_CUDA} -eq 4 ]]; then
+        CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}"
+    elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then
+        CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}"
+    fi
+    echo "Using CUDA $CUDA_VERSION as determined by DESIRED_CUDA"
+else
+    CUDA_VERSION=$(nvcc --version|tail -n1|cut -f5 -d" "|cut -f1 -d",")
+    echo "CUDA $CUDA_VERSION Detected"
+fi
 
 export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX"
 if [[ $CUDA_VERSION == "9.0" ]]; then
@@ -60,13 +74,20 @@ if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
 fi
 mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
 
+OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
+elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
+    LIBGOMP_PATH="/usr/lib/gcc/x86_64-linux-gnu/5/libgomp.so"
+fi
+
 if [[ $CUDA_VERSION == "9.0" ]]; then
 DEPS_LIST=(
     "/usr/local/cuda/lib64/libcudart.so.9.0"
     "/usr/local/cuda/lib64/libnvToolsExt.so.1"
     "/usr/local/cuda/lib64/libnvrtc.so.9.0"
     "/usr/local/cuda/lib64/libnvrtc-builtins.so"
-    "/usr/lib64/libgomp.so.1"
+    "$LIBGOMP_PATH"
 )
 
 DEPS_SONAME=(
@@ -82,7 +103,7 @@ DEPS_LIST=(
     "/usr/local/cuda/lib64/libnvToolsExt.so.1"
     "/usr/local/cuda/lib64/libnvrtc.so.9.2"
     "/usr/local/cuda/lib64/libnvrtc-builtins.so"
-    "/usr/lib64/libgomp.so.1"
+    "$LIBGOMP_PATH"
 )
 
 DEPS_SONAME=(
@@ -98,7 +119,7 @@ DEPS_LIST=(
     "/usr/local/cuda/lib64/libnvToolsExt.so.1"
     "/usr/local/cuda/lib64/libnvrtc.so.10.0"
     "/usr/local/cuda/lib64/libnvrtc-builtins.so"
-    "/usr/lib64/libgomp.so.1"
+    "$LIBGOMP_PATH"
 )
 
 DEPS_SONAME=(
@@ -114,7 +135,7 @@ DEPS_LIST=(
     "/usr/local/cuda/lib64/libnvToolsExt.so.1"
     "/usr/local/cuda/lib64/libnvrtc.so.10.1"
     "/usr/local/cuda/lib64/libnvrtc-builtins.so"
-    "/usr/lib64/libgomp.so.1"
+    "$LIBGOMP_PATH"
 )
 
 DEPS_SONAME=(
@@ -132,6 +153,12 @@ fi
 # builder/test.sh requires DESIRED_CUDA to know what tests to exclude
 export DESIRED_CUDA="$cuda_version_nodot"
 
+# Switch `/usr/local/cuda` to the desired CUDA version
+rm -rf /usr/local/cuda || true
+ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda
+export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
+export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0
+export CUDNN_VERSION=$(ls /usr/local/cuda/lib64/libcudnn.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev)
 
 SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
 source $SCRIPTPATH/build_common.sh
diff --git a/manywheel/build_common.sh b/manywheel/build_common.sh
@@ -20,7 +20,13 @@ retry () {
 }
 
 # TODO move this into the Docker images
-retry yum install -q -y zip openssl
+OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    retry yum install -q -y zip openssl
+elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
+    retry apt-get update
+    retry apt-get -y install zip openssl
+fi
 
 # We use the package name to test the package by passing this to 'pip install'
 # This is the env variable that setup.py uses to name the package. Note that
@@ -97,6 +103,13 @@ if [[ "$DESIRED_PYTHON"  == "cp37-cp37m" ]]; then
 else
     retry pip install -q numpy==1.11
 fi
+
+if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    export _GLIBCXX_USE_CXX11_ABI=1
+else
+    export _GLIBCXX_USE_CXX11_ABI=0
+fi
+
 echo "Calling setup.py bdist at $(date)"
 time CMAKE_ARGS=${CMAKE_ARGS[@]} \
      EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
@@ -142,9 +155,16 @@ if [[ -n "$BUILD_PYTHONLESS" ]]; then
     echo "$(pushd $pytorch_rootdir && git rev-parse HEAD)" > libtorch/build-hash
 
     mkdir -p /tmp/$LIBTORCH_HOUSE_DIR
-    zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch
-    cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \
-       /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_VARIANT-latest.zip
+
+    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+        LIBTORCH_ABI="cxx11-abi-"
+    else
+        LIBTORCH_ABI=
+    fi
+
+    zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch
+    cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \
+       /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip
 fi
 
 popd

diff --git a/manywheel/build_cpu.sh b/manywheel/build_cpu.sh
@@ -26,9 +26,15 @@ if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
 fi
 mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
 
+OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
+elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
+    LIBGOMP_PATH="/usr/lib/gcc/x86_64-linux-gnu/5/libgomp.so"
+fi
 
 DEPS_LIST=(
-    "/usr/lib64/libgomp.so.1"
+    "$LIBGOMP_PATH"
 )
 
 DEPS_SONAME=(

diff --git a/smoke_test.sh b/smoke_test.sh
@@ -48,7 +48,12 @@ if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
   else
     libtorch_variant="$LIBTORCH_VARIANT"
   fi
-  package_name="libtorch-$libtorch_variant-${NIGHTLIES_DATE_PREAMBLE}${DATE}.zip"
+  if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+      LIBTORCH_ABI="cxx11-abi-"
+  else
+      LIBTORCH_ABI=
+  fi
+  package_name="libtorch-$LIBTORCH_ABI$libtorch_variant-${NIGHTLIES_DATE_PREAMBLE}${DATE}.zip"
 elif [[ "$PACKAGE_TYPE" == *wheel ]]; then
   package_name='torch'
 elif [[ "$DESIRED_CUDA" == 'cpu' && "$(uname)" != 'Darwin' ]]; then

diff --git a/update_compiler.sh b/update_compiler.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 set -ex
 
+# NOTE: This script is called by default on all nightlies.
+
 # Expected to be run on a Docker image built from
 # https://github.com/pytorch/builder/blob/master/conda/Dockerfile (or the
 # manywheel equivalent)
@@ -10,34 +12,19 @@ set -ex
 # Why does this file exist? Why not just update the compiler on the base docker
 # images?
 #
-# So, all the nightlies used to be built on devtoolset3 with the old gcc ABI.
-# These packages worked well for most people, but could not be linked against
-# by client c++ libraries that were compiled using the new gcc ABI. Since both
-# gcc ABIs are still common in the wild, we should be able to support both
-# ABIs. Hence, we need a script to alter the compiler on the root docker images
-# to configure which ABI we want to build with.
+# Answer: Yes we should just update the compiler to devtoolset7 on all the CentOS
+# base docker images. There's no reason to keep around devtoolset3 because it's
+# not used anymore.
 #
-# So then this script was written to change from devtoolset3 to devtoolset7. It
-# turns out that this doesn't actually fix the problem, since devtoolset7 is
-# incapable of building with the new gcc ABI. BUT, devtoolset7 /is/ able to
+# We use devtoolset7 instead of devtoolset3 because devtoolset7 /is/ able to
 # build with avx512 instructions, which are needed for fbgemm to get good
-# performance. So now this script is called by default on all nightlies.
-#
-# But we still don't have the new gcc ABI. So what should happen next is
-# - Upgrade the compiler on all the base docker images to be devtoolset7.
-#   There's no reason to keep around devtoolset3. It will never be used.
-# - Alter this script to install another compiler toolchain, not a devtoolset#,
-#   which can build with the new gcc ABI. Then use this script as intended, in
-#   a parallel suite of new-gcc-ABI nightlies.
+# performance.
 #
-# When this script is finally changed to build with the new gcc ABI, then we'll
-# need to set this variable manually because
-# https://github.com/pytorch/pytorch/blob/master/torch/abi-check.cpp sets the
-# ABI to 0 by default. 
-# ``` export _GLIBCXX_USE_CXX11_ABI=1 ```
-# Note that this probably needs to get set in the .circleci infra that's
-# running this, since env variables set in this file are probably discarded.
-# ~~~
+# Note that devtoolset7 still *cannot* build with the new gcc ABI
+# (see https://bugzilla.redhat.com/show_bug.cgi?id=1546704). Instead, we use
+# Ubuntu 16.04 + gcc 5.4 to build with the new gcc ABI, using an Ubuntu 16.04
+# base docker image.
+# For details, see NOTE [ Building libtorch with old vs. new gcc ABI ].
 
 # The gcc version should be 4.9.2 right now
 echo "Initial gcc version is $(gcc --version)"