From d095d458b1436a46fbe558008fd460aa1c95ee91 Mon Sep 17 00:00:00 2001
From: Won-Kyu Park
Date: Wed, 16 Oct 2024 21:49:03 +0900
Subject: [PATCH] win32+clang support

* based on Windows support PR #2465 by @andreigh
  - https://github.com/openai/triton/pull/2465
* manually applied, rebased, fix lint errors
* use sysconfig.get_config_var() to get the path of python*.lib
* clang fix for windows
* remove '-fPIC' for windows clang
* fix download_and_copy() to support windows
* add "exe" extension for windows
* use "pyd" extension for windows to make importlib work
* third_party/nvidia: fix for windows
* win32 fix _path_to_binary()
* add library_dir, include_dir for win32
* backend/compiler: lazily remove temp files to support windows
* additional work done by @mantaionut (2024/05/31)
* rework for latest triton and cleanup (2024/10/14)
* extract minimal fixes to support win32+clang (2024/10/16)
* get exe/so extension using sysconfig (suggested by @anmyachev)
  see also: https://github.com/intel/intel-xpu-backend-for-triton/pull/2478

Original-author-by: Andrei Gheorghe
Signed-off-by: Won-Kyu Park
---
 python/setup.py                         | 32 +++++++++++++++++---------
 python/triton/backends/compiler.py      | 11 ++++++---
 python/triton/compiler/compiler.py      |  4 +++-
 python/triton/runtime/build.py          |  1 +
 third_party/nvidia/backend/compiler.py  | 17 +++++++-------
 third_party/nvidia/backend/driver.py    | 15 ++++++++++--
 6 files changed, 55 insertions(+), 25 deletions(-)

diff --git a/python/setup.py b/python/setup.py
index 714668462f0e2..1a7b114b81eaf 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -167,7 +167,7 @@ def get_json_package_info():
 def get_llvm_package_info():
     system = platform.system()
     try:
-        arch = {"x86_64": "x64", "arm64": "arm64", "aarch64": "arm64"}[platform.machine()]
+        arch = {"x86_64": "x64", "AMD64": "x64", "arm64": "arm64", "aarch64": "arm64"}[platform.machine()]
     except KeyError:
         arch = platform.machine()
     if system == "Darwin":
@@ -196,6 +196,8 @@ def get_llvm_package_info():
                 f"LLVM pre-compiled image is not available for {system}-{arch}. Proceeding with user-configured LLVM from source build."
             )
             return Package("llvm", "LLVM-C.lib", "", "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR", "LLVM_SYSPATH")
+    elif system == "Windows":
+        system_suffix = f"windows-{arch}"
     else:
         print(
             f"LLVM pre-compiled image is not available for {system}-{arch}. Proceeding with user-configured LLVM from source build."
@@ -281,10 +283,13 @@ def download_and_copy(name, src_path, dst_path, variable, version, url_func):
     base_dir = os.path.dirname(__file__)
     system = platform.system()
     try:
-        arch = {"x86_64": "64", "arm64": "aarch64", "aarch64": "aarch64"}[platform.machine()]
+        arch = {"x86_64": "64", "AMD64": "64", "arm64": "aarch64", "aarch64": "aarch64"}[platform.machine()]
     except KeyError:
         arch = platform.machine()
-    supported = {"Linux": "linux", "Darwin": "linux"}
+    supported = {"Linux": "linux", "Darwin": "linux", "Windows": "win"}
+    if system not in supported:
+        return
+
     url = url_func(supported[system], arch, version)
     tmp_path = os.path.join(triton_cache_path, "nvidia", name)  # path to cache the download
     dst_path = os.path.join(base_dir, os.pardir, "third_party", "nvidia", "backend", dst_path)  # final binary path
@@ -292,7 +297,7 @@ def download_and_copy(name, src_path, dst_path, variable, version, url_func):
     src_path = src_path(platform_name, version) if callable(src_path) else src_path
     src_path = os.path.join(tmp_path, src_path)
     download = not os.path.exists(src_path)
-    if os.path.exists(dst_path) and system == "Linux" and shutil.which(dst_path) is not None:
+    if os.path.exists(dst_path) and system in ("Linux", "Windows") and shutil.which(dst_path) is not None:
         curr_version = subprocess.check_output([dst_path, "--version"]).decode("utf-8").strip()
         curr_version = re.search(r"V([.|\d]+)", curr_version).group(1)
         download = download or curr_version != version
@@ -421,6 +426,10 @@ def build_extension(self, ext):
             "-DTRITON_CODEGEN_BACKENDS=" + ';'.join([b.name for b in backends if not b.is_external]),
             "-DTRITON_PLUGIN_DIRS=" + ';'.join([b.src_dir for b in backends if b.is_external])
         ]
+        if platform.system() == "Windows":
+            installed_base = sysconfig.get_config_var('installed_base')
+            py_lib_dirs = os.getenv("PYTHON_LIB_DIRS", os.path.join(installed_base, "libs"))
+            cmake_args.append("-DPYTHON_LIB_DIRS=" + py_lib_dirs)
         if lit_dir is not None:
             cmake_args.append("-DLLVM_EXTERNAL_LIT=" + lit_dir)
         cmake_args.extend(thirdparty_cmake_args)
@@ -430,9 +439,8 @@ def build_extension(self, ext):
         build_args = ["--config", cfg]
 
         if platform.system() == "Windows":
+            cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg]
             cmake_args += [f"-DCMAKE_RUNTIME_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"]
-            if sys.maxsize > 2**32:
-                cmake_args += ["-A", "x64"]
         else:
             cmake_args += ["-DCMAKE_BUILD_TYPE=" + cfg]
         max_jobs = os.getenv("MAX_JOBS", str(2 * os.cpu_count()))
@@ -499,8 +507,10 @@ def get_platform_dependent_src_path(subdir):
          if int(version_major) >= 12 and int(version_minor1) >= 5 else subdir)(*version.split('.')))
 
 
+exe = sysconfig.get_config_var("EXE")
+
 download_and_copy(
-    name="ptxas", src_path="bin/ptxas", dst_path="bin/ptxas", variable="TRITON_PTXAS_PATH",
+    name="ptxas", src_path=f"bin/ptxas{exe}", dst_path=f"bin/ptxas{exe}", variable="TRITON_PTXAS_PATH",
     version=NVIDIA_TOOLCHAIN_VERSION["ptxas"], url_func=lambda system, arch, version:
     ((lambda version_major, version_minor1, version_minor2:
      f"https://anaconda.org/nvidia/cuda-nvcc-tools/{version}/download/{system}-{arch}/cuda-nvcc-tools-{version}-0.tar.bz2"
@@ -509,8 +519,8 @@ def get_platform_dependent_src_path(subdir):
      (*version.split('.'))))
 download_and_copy(
     name="cuobjdump",
-    src_path="bin/cuobjdump",
-    dst_path="bin/cuobjdump",
+    src_path=f"bin/cuobjdump{exe}",
+    dst_path=f"bin/cuobjdump{exe}",
     variable="TRITON_CUOBJDUMP_PATH",
     version=NVIDIA_TOOLCHAIN_VERSION["cuobjdump"],
     url_func=lambda system, arch, version:
@@ -518,8 +528,8 @@ def get_platform_dependent_src_path(subdir):
 )
 download_and_copy(
     name="nvdisasm",
-    src_path="bin/nvdisasm",
-    dst_path="bin/nvdisasm",
+    src_path=f"bin/nvdisasm{exe}",
+    dst_path=f"bin/nvdisasm{exe}",
     variable="TRITON_NVDISASM_PATH",
     version=NVIDIA_TOOLCHAIN_VERSION["nvdisasm"],
     url_func=lambda system, arch, version:
diff --git a/python/triton/backends/compiler.py b/python/triton/backends/compiler.py
index f2ba8eac807fc..9b864932163c7 100644
--- a/python/triton/backends/compiler.py
+++ b/python/triton/backends/compiler.py
@@ -2,6 +2,7 @@
 import re
 import hashlib
 import subprocess
+import sysconfig
 
 from abc import ABCMeta, abstractmethod, abstractclassmethod
 from dataclasses import dataclass
@@ -228,20 +229,24 @@ def __init__(self, target: GPUTarget) -> None:
 
     @staticmethod
     def _path_to_binary(binary: str):
+        exe = sysconfig.get_config_var("EXE")
         base_dir = os.path.join(os.path.dirname(__file__), os.pardir)
         paths = [
             os.environ.get(f"TRITON_{binary.upper()}_PATH", ""),
-            os.path.join(base_dir, "third_party", "cuda", "bin", binary),
+            os.path.join(base_dir, "third_party", "cuda", "bin", f"{binary}{exe}"),
         ]
         for p in paths:
-            bin = p.split(" ")[0]
+            if os.name != "nt":
+                bin = p.split(" ")[0]
+            else:
+                bin = p
             if os.path.exists(bin) and os.path.isfile(bin):
                 result = subprocess.check_output([bin, "--version"], stderr=subprocess.STDOUT)
                 if result is not None:
                     version = re.search(r".*release (\d+\.\d+).*", result.decode("utf-8"), flags=re.MULTILINE)
                     if version is not None:
                         return p, version.group(1)
-        raise RuntimeError(f"Cannot find {binary}")
+        raise RuntimeError(f"Cannot find {binary}{exe}")
 
     @abstractclassmethod
     def supports_target(target: GPUTarget):
diff --git a/python/triton/compiler/compiler.py b/python/triton/compiler/compiler.py
index 8ca1f8b326d03..bde0d3f3ceca4 100644
--- a/python/triton/compiler/compiler.py
+++ b/python/triton/compiler/compiler.py
@@ -15,6 +15,7 @@
 import re
 import functools
 import os
+import sysconfig
 
 # - ^\s*tt\.func\s+ : match the start of the string, any leading whitespace, the keyword func,
 # and any following whitespace
@@ -151,7 +152,8 @@ def triton_key():
 
     # backend
     libtriton_hash = hashlib.sha256()
-    with open(os.path.join(TRITON_PATH, "_C/libtriton.so"), "rb") as f:
+    so_ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1]
+    with open(os.path.join(TRITON_PATH, "_C", "libtriton." + so_ext), "rb") as f:
+ so_ext), "rb") as f: while True: chunk = f.read(1024**2) if not chunk: diff --git a/python/triton/runtime/build.py b/python/triton/runtime/build.py index 20da2bc25790b..0534dd0d28397 100644 --- a/python/triton/runtime/build.py +++ b/python/triton/runtime/build.py @@ -47,6 +47,7 @@ def _build(name, src, srcdir, library_dirs, include_dirs, libraries): cc_cmd += [f'-l{lib}' for lib in libraries] cc_cmd += [f"-L{dir}" for dir in library_dirs] cc_cmd += [f"-I{dir}" for dir in include_dirs if dir is not None] + if os.name == "nt": cc_cmd.pop(cc_cmd.index("-fPIC")) ret = subprocess.check_call(cc_cmd) if ret == 0: return so diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py index 36e73d6b882d3..507715b3b4d8a 100644 --- a/third_party/nvidia/backend/compiler.py +++ b/third_party/nvidia/backend/compiler.py @@ -11,6 +11,7 @@ import signal import os import subprocess +import sysconfig from pathlib import Path @@ -20,9 +21,10 @@ def min_dot_size(target: GPUTarget): @functools.lru_cache() def _path_to_binary(binary: str): + exe = sysconfig.get_config_var("EXE") paths = [ os.environ.get(f"TRITON_{binary.upper()}_PATH", ""), - os.path.join(os.path.dirname(__file__), "bin", binary), + os.path.join(os.path.dirname(__file__), "bin", f"{binary}{exe}"), ] for bin in paths: @@ -32,7 +34,7 @@ def _path_to_binary(binary: str): version = re.search(r".*release (\d+\.\d+).*", result.decode("utf-8"), flags=re.MULTILINE) if version is not None: return bin, version.group(1) - raise RuntimeError(f"Cannot find {binary}") + raise RuntimeError(f"Cannot find {binary}{exe}") @functools.lru_cache() @@ -340,15 +342,9 @@ def make_cubin(src, metadata, opt, capability): ] try: subprocess.run(ptxas_cmd, check=True, close_fds=False, stderr=flog) - if os.path.exists(fsrc.name): - os.remove(fsrc.name) - if os.path.exists(flog.name): - os.remove(flog.name) except subprocess.CalledProcessError as e: with open(flog.name) as log_file: log = log_file.read() - if os.path.exists(flog.name): - os.remove(flog.name) if e.returncode == 255: error = 'Internal Triton PTX codegen error' @@ -365,6 +361,11 @@ def make_cubin(src, metadata, opt, capability): cubin = f.read() if os.path.exists(fbin): os.remove(fbin) + + if os.path.exists(fsrc.name): + os.remove(fsrc.name) + if os.path.exists(flog.name): + os.remove(flog.name) return cubin def add_stages(self, stages, options): diff --git a/third_party/nvidia/backend/driver.py b/third_party/nvidia/backend/driver.py index c5f6419afca76..9c9057c637d6d 100644 --- a/third_party/nvidia/backend/driver.py +++ b/third_party/nvidia/backend/driver.py @@ -1,6 +1,7 @@ import functools import os import hashlib +import sysconfig import subprocess import tempfile from pathlib import Path @@ -14,12 +15,20 @@ libdevice_dir = os.path.join(dirname, "lib") libraries = ['cuda'] +if os.name == "nt": + include_dir += [os.path.join(os.environ.get("CUDA_PATH"), "include")] + @functools.lru_cache() def libcuda_dirs(): env_libcuda_path = os.getenv("TRITON_LIBCUDA_PATH") if env_libcuda_path: return [env_libcuda_path] + if os.name == "nt": + installed_base = sysconfig.get_config_var('installed_base') + dirs = [os.path.join(os.environ.get("CUDA_PATH"), "lib", "x64")] + dirs += [os.getenv("PYTHON_LIB_DIRS", os.path.join(installed_base, "libs"))] + return dirs libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode() # each line looks like the following: @@ -48,7 +57,9 @@ def library_dirs(): def compile_module_from_src(src, name): key = hashlib.sha256(src.encode("utf-8")).hexdigest() 
     cache = get_cache_manager(key)
-    cache_path = cache.get_file(f"{name}.so")
+    so_ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1]
+    so_name = f'{name}.{so_ext}'
+    cache_path = cache.get_file(so_name)
     if cache_path is None:
         with tempfile.TemporaryDirectory() as tmpdir:
             src_path = os.path.join(tmpdir, "main.c")
@@ -56,7 +67,7 @@ def compile_module_from_src(src, name):
                 f.write(src)
             so = _build(name, src_path, tmpdir, library_dirs(), include_dir, libraries)
             with open(so, "rb") as f:
-                cache_path = cache.put(f.read(), f"{name}.so", binary=True)
+                cache_path = cache.put(f.read(), so_name, binary=True)
     import importlib.util
     spec = importlib.util.spec_from_file_location(name, cache_path)
     mod = importlib.util.module_from_spec(spec)
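
Note (editorial, not part of the patch): the executable and extension-module suffixes used
throughout the diff come straight from sysconfig rather than being hardcoded. A minimal sketch
of the idea, assuming a standard CPython install; the helper name get_platform_suffixes is
made up for illustration:

    import sysconfig

    def get_platform_suffixes():
        # "EXE" is "" on Linux/macOS and ".exe" on Windows builds of CPython
        exe = sysconfig.get_config_var("EXE")
        # "EXT_SUFFIX" is e.g. ".cpython-312-x86_64-linux-gnu.so" on Linux and
        # ".cp312-win_amd64.pyd" on Windows; keeping only the last dot-separated
        # component yields "so" or "pyd", the extension importlib expects
        so_ext = sysconfig.get_config_var("EXT_SUFFIX").split(".")[-1]
        return exe, so_ext

    # e.g. ("", "so") on Linux, (".exe", "pyd") on Windows
    print(get_platform_suffixes())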