ci(wheel): build PT OPs (deepmodeling#3894)
Build PT OPs for our wheels (built against the CPU version of PT, but they also
work with the GPU version). Add PT to the build dependencies if it is not found.
Bump MPICH to the latest version. Update the documentation to note that PT
support for LAMMPS and i-PI is not included.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
  - Introduced PyTorch support across various backend functionalities, enhancing compatibility and capabilities alongside existing TensorFlow support.

- **Documentation**
  - Updated installation guide to include warnings about TensorFlow backend support and removed outdated notes about macOS arm64 package support.

- **Chores**
  - Updated dependency versions and build configurations to include new PyTorch settings and improved compatibility for macOS and Linux.
  - Adjusted build scripts to conditionally link libraries based on new PyTorch-related parameters.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Jinzhe Zeng <[email protected]>
njzjz authored and Mathieu Taillefumier committed Sep 18, 2024
1 parent 10b752f commit ad226aa
Showing 8 changed files with 148 additions and 26 deletions.
15 changes: 13 additions & 2 deletions backend/dp_backend.py
@@ -7,6 +7,9 @@

from scikit_build_core import build as _orig

from .find_pytorch import (
find_pytorch,
)
from .find_tensorflow import (
find_tensorflow,
)
@@ -40,10 +43,18 @@ def __dir__() -> List[str]:
def get_requires_for_build_wheel(
config_settings: dict,
) -> List[str]:
return _orig.get_requires_for_build_wheel(config_settings) + find_tensorflow()[1]
return (
_orig.get_requires_for_build_wheel(config_settings)
+ find_tensorflow()[1]
+ find_pytorch()[1]
)


def get_requires_for_build_editable(
config_settings: dict,
) -> List[str]:
return _orig.get_requires_for_build_editable(config_settings) + find_tensorflow()[1]
return (
_orig.get_requires_for_build_editable(config_settings)
+ find_tensorflow()[1]
+ find_pytorch()[1]
)
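
For illustration only, a minimal sketch of exercising the updated hook outside a real PEP 517 frontend. It assumes the deepmd-kit source tree is the working directory (so `backend` is importable), `scikit-build-core` is installed, and `torch` is absent from the build environment; the printed list is indicative, not exact.

```python
# Sketch only - not part of the diff. Calls the build hook directly, assuming the
# deepmd-kit source tree is on sys.path and scikit-build-core is installed.
import os

# Opt in to the PyTorch backend before the first (lru_cache'd) lookup runs.
os.environ["DP_ENABLE_PYTORCH"] = "1"

from backend.dp_backend import get_requires_for_build_wheel

# With torch not importable, the result contains scikit-build-core's own requirements,
# the TensorFlow requirement, and a PyTorch requirement such as "torch>=2a".
print(get_requires_for_build_wheel({}))
```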
8 changes: 7 additions & 1 deletion backend/dynamic_metadata.py
@@ -9,6 +9,9 @@
Optional,
)

from .find_pytorch import (
get_pt_requirement,
)
from .find_tensorflow import (
get_tf_requirement,
)
@@ -33,7 +36,9 @@ def dynamic_metadata(
settings: Optional[Dict[str, object]] = None,
):
assert field in ["optional-dependencies", "entry-points", "scripts"]
_, _, find_libpython_requires, extra_scripts, tf_version = get_argument_from_env()
_, _, find_libpython_requires, extra_scripts, tf_version, pt_version = (
get_argument_from_env()
)
with Path("pyproject.toml").open("rb") as f:
pyproject = tomllib.load(f)

@@ -51,4 +56,5 @@ def dynamic_metadata(
return {
**optional_dependencies,
**get_tf_requirement(tf_version),
**get_pt_requirement(pt_version),
}
75 changes: 72 additions & 3 deletions backend/find_pytorch.py
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
import importlib
import os
import site
from functools import (
@@ -17,12 +18,19 @@
get_path,
)
from typing import (
List,
Optional,
Tuple,
Union,
)

from packaging.version import (
Version,
)


@lru_cache
def find_pytorch() -> Optional[str]:
def find_pytorch() -> Tuple[Optional[str], List[str]]:
"""Find PyTorch library.
Tries to find PyTorch in the order of:
@@ -39,9 +47,12 @@ def find_pytorch() -> Optional[str]:
-------
str, optional
PyTorch library path if found.
list of str
PyTorch requirement if not found. Empty if found.
"""
if os.environ.get("DP_ENABLE_PYTORCH", "0") == "0":
return None
return None, []
requires = []
pt_spec = None

if (pt_spec is None or not pt_spec) and os.environ.get("PYTORCH_ROOT") is not None:
@@ -73,4 +84,62 @@ def find_pytorch() -> Optional[str]:
# IndexError if submodule_search_locations is an empty list
except (AttributeError, TypeError, IndexError):
pt_install_dir = None
return pt_install_dir
requires.extend(get_pt_requirement()["torch"])
return pt_install_dir, requires


@lru_cache
def get_pt_requirement(pt_version: str = "") -> dict:
"""Get PyTorch requirement when PT is not installed.
If pt_version is not given and the environment variable `PYTORCH_VERSION` is set, use it as the requirement.
Parameters
----------
pt_version : str, optional
PT version
Returns
-------
dict
PyTorch requirement.
"""
if pt_version is None:
return {"torch": []}
if pt_version == "":
pt_version = os.environ.get("PYTORCH_VERSION", "")

return {
"torch": [
# uv has different local version behaviors, i.e. `==2.3.1` cannot match `==2.3.1+cpu`
# https://github.com/astral-sh/uv/blob/main/PIP_COMPATIBILITY.md#local-version-identifiers
# luckily, .* (prefix matching) defined in PEP 440 can match any local version
# https://peps.python.org/pep-0440/#version-matching
f"torch=={Version(pt_version).base_version}.*"
if pt_version != ""
else "torch>=2a",
],
}


@lru_cache
def get_pt_version(pt_path: Optional[Union[str, Path]]) -> str:
"""Get TF version from a TF Python library path.
Parameters
----------
pt_path : str or Path
PT Python library path
Returns
-------
str
version
"""
if pt_path is None or pt_path == "":
return ""
version_file = Path(pt_path) / "version.py"
spec = importlib.util.spec_from_file_location("torch.version", version_file)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module.__version__
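
As a quick check of the pinning rule described in the comments above (not part of the diff), `packaging` can be used directly to confirm that the `.*` prefix pin derived from a local-version torch build matches any local variant; the versions below are only examples.

```python
# Illustration of the pin get_pt_requirement would derive from a local-version build.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

pt_version = "2.3.1+cpu"  # e.g. the CPU wheel the extension was built against
pin = f"torch=={Version(pt_version).base_version}.*"
print(pin)  # torch==2.3.1.*

# PEP 440 prefix matching ignores local version labels,
# so both CPU and CUDA builds of 2.3.1 satisfy the pin.
assert Version("2.3.1+cpu") in SpecifierSet("==2.3.1.*")
assert Version("2.3.1+cu121") in SpecifierSet("==2.3.1.*")
```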
2 changes: 1 addition & 1 deletion backend/find_tensorflow.py
@@ -204,7 +204,7 @@ def get_tf_requirement(tf_version: str = "") -> dict:


@lru_cache
def get_tf_version(tf_path: Union[str, Path]) -> str:
def get_tf_version(tf_path: Optional[Union[str, Path]]) -> str:
"""Get TF version from a TF Python library path.
Parameters
14 changes: 9 additions & 5 deletions backend/read_env.py
@@ -15,6 +15,7 @@

from .find_pytorch import (
find_pytorch,
get_pt_version,
)
from .find_tensorflow import (
find_tensorflow,
@@ -23,7 +24,7 @@


@lru_cache
def get_argument_from_env() -> Tuple[str, list, list, dict, str]:
def get_argument_from_env() -> Tuple[str, list, list, dict, str, str]:
"""Get the arguments from environment variables.
The environment variables are assumed to be not changed during the build.
@@ -40,6 +41,8 @@ def get_argument_from_env() -> Tuple[str, list, list, dict, str]:
The extra scripts to be installed.
str
The TensorFlow version.
str
The PyTorch version.
"""
cmake_args = []
extra_scripts = {}
@@ -103,9 +106,8 @@ def get_argument_from_env() -> Tuple[str, list, list, dict, str]:
tf_version = None

if os.environ.get("DP_ENABLE_PYTORCH", "0") == "1":
pt_install_dir = find_pytorch()
if pt_install_dir is None:
raise RuntimeError("Cannot find installed PyTorch.")
pt_install_dir, _ = find_pytorch()
pt_version = get_pt_version(pt_install_dir)
cmake_args.extend(
[
"-DENABLE_PYTORCH=ON",
Expand All @@ -114,6 +116,7 @@ def get_argument_from_env() -> Tuple[str, list, list, dict, str]:
)
else:
cmake_args.append("-DENABLE_PYTORCH=OFF")
pt_version = None

cmake_args = [
"-DBUILD_PY_IF:BOOL=TRUE",
@@ -125,11 +128,12 @@ def get_argument_from_env() -> Tuple[str, list, list, dict, str]:
find_libpython_requires,
extra_scripts,
tf_version,
pt_version,
)


def set_scikit_build_env():
"""Set scikit-build environment variables before executing scikit-build."""
cmake_minimum_required_version, cmake_args, _, _, _ = get_argument_from_env()
cmake_minimum_required_version, cmake_args, _, _, _, _ = get_argument_from_env()
os.environ["SKBUILD_CMAKE_MINIMUM_VERSION"] = cmake_minimum_required_version
os.environ["SKBUILD_CMAKE_ARGS"] = ";".join(cmake_args)
6 changes: 5 additions & 1 deletion doc/install/easy-install.md
@@ -132,7 +132,11 @@ pip install deepmd-kit[cpu]
pip install deepmd-kit[gpu,cu12,torch,lmp,ipi]
```

MPICH is required for parallel running. (The macOS arm64 package doesn't support MPI yet.)
MPICH is required for parallel running.

:::{Warning}
When installing from pip, only the TensorFlow {{ tensorflow_icon }} backend is supported with LAMMPS and i-PI.
:::

It is suggested to install the package into an isolated environment.
The supported platform includes Linux x86-64 and aarch64 with GNU C Library 2.28 or above, macOS x86-64 and arm64, and Windows x86-64.
49 changes: 37 additions & 12 deletions pyproject.toml
@@ -126,9 +126,6 @@ cu12 = [
"nvidia-cudnn-cu12<9",
"nvidia-cuda-nvcc-cu12",
]
torch = [
"torch>=2a",
]

[tool.deepmd_build_backend.scripts]
dp = "deepmd.main:main"
@@ -198,56 +195,84 @@ replacement = '\1="https://github.com/deepmodeling/deepmd-kit/raw/master/\g<2>"'
[tool.cibuildwheel]
test-command = [
"python -m deepmd -h",
"""python -c "import deepmd.tf;import deepmd.pt" """,
"dp -h",
"dp_ipi",
"pytest {project}/source/tests/tf/test_lammps.py"
]
test-extras = ["cpu", "test", "lmp", "ipi"]
build = ["cp310-*"]
test-extras = ["cpu", "test", "lmp", "ipi", "torch"]
build = ["cp311-*"]
skip = ["*-win32", "*-manylinux_i686", "*-musllinux*"]
# TODO: uncomment to use the latest image when CUDA 11 is deprecated
# manylinux-x86_64-image = "manylinux_2_28"
manylinux-x86_64-image = "quay.io/pypa/manylinux_2_28_x86_64:2022-11-19-1b19e81"
manylinux-aarch64-image = "manylinux_2_28"

[tool.cibuildwheel.macos]
environment = { PIP_PREFER_BINARY="1", DP_LAMMPS_VERSION="stable_2Aug2023_update3", DP_ENABLE_IPI="1" }
before-all = [
"""brew install mpich""",
'''pip install -i https://pypi.anaconda.org/mpi4py/simple mpich''',
]
repair-wheel-command = """delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel} --ignore-missing-dependencies"""

[tool.cibuildwheel.macos.environment]
PIP_PREFER_BINARY = "1"
DP_LAMMPS_VERSION = "stable_2Aug2023_update3"
DP_ENABLE_IPI = "1"
DP_ENABLE_PYTORCH = "1"
# for an unclear reason, OpenMP is accidentally found when PyTorch is enabled
CMAKE_ARGS = "-DCMAKE_DISABLE_FIND_PACKAGE_OpenMP=1"

[[tool.cibuildwheel.overrides]]
# error: 'value' is unavailable: introduced in macOS 10.13
select = "*-macosx_x86_64"
inherit.environment = "append"
environment.MACOSX_DEPLOYMENT_TARGET = "10.13"

[tool.cibuildwheel.linux]
repair-wheel-command = "auditwheel repair --exclude libtensorflow_framework.so.2 --exclude libtensorflow_framework.so.1 --exclude libtensorflow_framework.so --exclude _pywrap_tensorflow_internal.so --exclude libtensorflow_cc.so.2 -w {dest_dir} {wheel}"
repair-wheel-command = "auditwheel repair --exclude libtensorflow_framework.so.2 --exclude libtensorflow_framework.so.1 --exclude libtensorflow_framework.so --exclude _pywrap_tensorflow_internal.so --exclude libtensorflow_cc.so.2 --exclude libc10.so --exclude libtorch.so --exclude libtorch_cpu.so -w {dest_dir} {wheel}"
environment-pass = [
"CIBW_BUILD",
"DP_VARIANT",
"CUDA_VERSION",
"DP_PKG_NAME",
"SETUPTOOLS_SCM_PRETEND_VERSION",
]
environment = { PIP_PREFER_BINARY="1", DP_LAMMPS_VERSION="stable_2Aug2023_update3", DP_ENABLE_IPI="1", MPI_HOME="/usr/lib64/mpich", PATH="/usr/lib64/mpich/bin:$PATH" }
before-all = [
"""if [ ! -z "${DP_PKG_NAME}" ]; then sed -i "s/name = \\"deepmd-kit\\"/name = \\"${DP_PKG_NAME}\\"/g" pyproject.toml; fi""",
# https://almalinux.org/blog/2023-12-20-almalinux-8-key-update/
"""rpm --import https://repo.almalinux.org/almalinux/RPM-GPG-KEY-AlmaLinux""",
"""{ if [ "$(uname -m)" = "x86_64" ] ; then yum config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && yum install -y cuda-nvcc-${CUDA_VERSION/./-} cuda-cudart-devel-${CUDA_VERSION/./-}; fi }""",
"yum install -y mpich-devel",
'''/opt/python/cp311-cp311/bin/python -m pip install -i https://pypi.anaconda.org/mpi4py/simple mpich''',
# uv is not available in the old manylinux image
"""{ if [ "$(uname -m)" = "x86_64" ] ; then pipx install uv; fi }""",
]
before-build = [
# old build doesn't support uv
"""{ if [ "$(uname -m)" = "x86_64" ] ; then uv pip install --system -U build; fi }""",
]
[tool.cibuildwheel.linux.environment]
PIP_PREFER_BINARY = "1"
DP_LAMMPS_VERSION = "stable_2Aug2023_update3"
DP_ENABLE_IPI = "1"
DP_ENABLE_PYTORCH = "1"
MPI_HOME = "/usr/lib64/mpich"
PATH = "/usr/lib64/mpich/bin:$PATH"
# use CPU version of torch for building, which should also work for GPU
# note: uv has different behavior from pip on extra index url
# https://github.com/astral-sh/uv/blob/main/PIP_COMPATIBILITY.md#packages-that-exist-on-multiple-indexes
UV_EXTRA_INDEX_URL = "https://download.pytorch.org/whl/cpu"
# trick to find the correct version of mpich
CMAKE_PREFIX_PATH="/opt/python/cp311-cp311/"

[tool.cibuildwheel.windows]
environment = { PIP_PREFER_BINARY="1" }
test-extras = ["cpu"]
test-extras = ["cpu", "torch"]
test-command = [
"python -m deepmd -h",
"dp -h",
]
[tool.cibuildwheel.windows.environment]
PIP_PREFER_BINARY = "1"
DP_ENABLE_PYTORCH = "1"

# One can run `tox` or `tox -e gpu`
# to run pytest in an isolated environment
5 changes: 4 additions & 1 deletion source/api_cc/CMakeLists.txt
@@ -16,7 +16,10 @@ if(ENABLE_TENSORFLOW)
TensorFlow::tensorflow_framework)
target_compile_definitions(${libname} PRIVATE BUILD_TENSORFLOW)
endif()
if(ENABLE_PYTORCH AND "${OP_CXX_ABI_PT}" EQUAL "${OP_CXX_ABI}")
if(ENABLE_PYTORCH
AND "${OP_CXX_ABI_PT}" EQUAL "${OP_CXX_ABI}"
# LAMMPS and i-PI in the Python package are not ready - needs more work
AND NOT BUILD_PY_IF)
target_link_libraries(${libname} PRIVATE "${TORCH_LIBRARIES}")
target_compile_definitions(${libname} PRIVATE BUILD_PYTORCH)
endif()