support nvcc host compiler (facebookincubator#898)

Summary: Add the `AIT_NVCC_CCBIN` env variable to set the nvcc host compiler. Minor fixes: (1) document `AIT_ENABLE_CUDA_LTO`; (2) eliminate an empty space caused by debug options. Pull Request resolved: facebookincubator#898. Reviewed By: sgrigory. Differential Revision: D48435098. Pulled By: aakhundov. fbshipit-source-id: 80160f9258fdd62a1df83115162f84c5db96b15c
henrylhtsang · Aug 20, 2023 · b5841ab · b5841ab
1 parent 34340fb
commit b5841ab
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 9 deletions.
diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst
@@ -14,6 +14,10 @@ Codegen
 
 **AIT_COMPILER_OPT**: The optimization level for a compiler, which is directly passed to the host compiler command line. AITemplate host code may be very light in certain cases, so there is nothing to optimize for a host compiler. Thus, there is no need to make host compiler perform time costly optimizations. It may be very useful to use "-O0" value for debugging GPU kernels. "-O3" by default.
 
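+**AIT_NVCC_CCBIN**: The host compiler for nvcc to use. When set to a non-empty value, it is passed to nvcc via the "-ccbin" option; when unset or empty, no "-ccbin" option is added.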
+
+**AIT_ENABLE_CUDA_LTO**: If set to "1", nvcc will use LTO flags during compilation. Default value is "0".
+
 **AIT_TIME_COMPILATION**: If set to "1", time each make command at the compilation time. This helps us to do compilation time analysis. Requires to install `time <https://man7.org/linux/man-pages/man1/time.1.html>`_ package.
 
 **AIT_MULTISTREAM_MODE**: Controls multi-stream mode. Default mode is "0".

diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py
@@ -844,14 +844,19 @@ def _gen_compiler_version_files(self, target_dir):
         # for cache invalidation purposes (different compiler versions
         # should not reuse same cached build artifacts )
         cc = Target.current().cc()
-        compilers = {"main_compiler": cc}
+        compilers = {}
         if "nvcc" in cc:
-            ccbin_match = re.search(r'-ccbin "?([^ "]+)', cc)
+            # extract the part before " -ccbin " as group #1
+            # and the content of the quoted expression (until
+            # the first space) after " -ccbin " as group #2
+            ccbin_match = re.search(r'(.*) -ccbin "?([^ "]+)', cc)
             if ccbin_match:
-                nvcc_host_compiler = ccbin_match.group(1)
+                cc = ccbin_match.group(1)
+                nvcc_host_compiler = ccbin_match.group(2)
             else:
                 nvcc_host_compiler = "g++"  # default, using PATH resolution
             compilers["nvcc_host_compiler"] = nvcc_host_compiler
+        compilers["main_compiler"] = cc
 
         # Write compiler version string(s)
         # into the build directory, to enable using them for cache hash determination

diff --git a/python/aitemplate/backend/cuda/target_def.py b/python/aitemplate/backend/cuda/target_def.py
@@ -135,16 +135,16 @@ def _build_gnu_host_compiler_options(self) -> List[str]:
     def get_host_compiler_options(self) -> List[str]:
         return self._build_gnu_host_compiler_options()
 
-    def _get_nvcc_debug_options(self) -> str:
-        CUDA_DEBUG_LEVEL_STRINGS = ["", "-lineinfo", "-g -G"]
+    def _get_nvcc_debug_options(self) -> List[str]:
+        CUDA_DEBUG_LEVEL_STRINGS = [[], ["-lineinfo"], ["-g", "-G"]]
         level = environ.get_cuda_nvcc_debug_level()
         if level.isdigit():
             level = int(level)
             assert (
                 level >= 0 and level < 3
             ), "Debug level out of range. Must be 0 (no debug info), 1 (lineinfo) or 2 (with debug info, disable opt)"
             return CUDA_DEBUG_LEVEL_STRINGS[level]
-        return level
+        return [level]
 
     def _build_nvcc_compiler_options(self) -> List[str]:
         code = [f"sm_{self._arch}", f"compute_{self._arch}"]
@@ -169,7 +169,7 @@ def _build_nvcc_compiler_options(self) -> List[str]:
                     "--source-in-ptx",
                 ]
             ),  # Annotate the ptx file with source information
-        options.append(self._get_nvcc_debug_options())
+        options.extend(self._get_nvcc_debug_options())
         if self._ndebug == 1:
             options.append("-DNDEBUG")
         if environ.use_fast_math() and (
@@ -242,7 +242,10 @@ def __exit__(self, ptype, value, trace):
             shutil.rmtree(self.lib_folder)
 
     def cc(self):
-        return "nvcc"
+        cc = "nvcc"
+        if environ.nvcc_ccbin():
+            cc += " -ccbin " + environ.nvcc_ccbin()
+        return cc
 
     def compile_cmd(self, executable=False):
         if executable:
@@ -443,7 +446,7 @@ def _build_compile_options(self):
                         "--source-in-ptx",  # Annotate the ptx file with source information
                     ]
                 ),
-            options.append(self._get_nvcc_debug_options())
+            options.extend(self._get_nvcc_debug_options())
             if self._ndebug == 1:
                 options.append("-DNDEBUG")
             FBCUDA.static_compile_options_ = options

diff --git a/python/aitemplate/utils/environ.py b/python/aitemplate/utils/environ.py
@@ -63,6 +63,13 @@ def enable_cuda_lto() -> bool:
     return os.getenv("AIT_ENABLE_CUDA_LTO", "0") == "1"
 
 
+def nvcc_ccbin() -> str:
+    """
+    Return the nvcc host compiler (ccbin) read from the AIT_NVCC_CCBIN environment variable, or an empty string when it is unset.
+    """
+    return os.getenv("AIT_NVCC_CCBIN", "")
+
+
 def force_profiler_cache() -> bool:
     """
     Force the profiler to use the cached results. The profiler will throw