Commit

Merge branch 'main' into pauth-signed-got-relocs
kovdan01 committed Jul 1, 2024
2 parents 8603078 + 8e8c455 commit 7d47db8
Showing 4,093 changed files with 212,228 additions and 102,410 deletions.
16 changes: 12 additions & 4 deletions .github/CODEOWNERS
@@ -64,8 +64,8 @@ clang/test/AST/Interp/ @tbaederr
/mlir/Dialect/*/Transforms/Bufferize.cpp @matthias-springer

# Linalg Dialect in MLIR.
/mlir/include/mlir/Dialect/Linalg/* @dcaballe @nicolasvasilache @rengolin
/mlir/lib/Dialect/Linalg/* @dcaballe @nicolasvasilache @rengolin
/mlir/include/mlir/Dialect/Linalg @dcaballe @nicolasvasilache @rengolin
/mlir/lib/Dialect/Linalg @dcaballe @nicolasvasilache @rengolin
/mlir/lib/Dialect/Linalg/Transforms/DecomposeLinalgOps.cpp @MaheshRavishankar @nicolasvasilache
/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @MaheshRavishankar @nicolasvasilache
/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @MaheshRavishankar @nicolasvasilache
@@ -85,8 +85,8 @@ clang/test/AST/Interp/ @tbaederr
/mlir/**/*VectorToSCF* @banach-space @dcaballe @matthias-springer @nicolasvasilache
/mlir/**/*VectorToLLVM* @banach-space @dcaballe @nicolasvasilache
/mlir/**/*X86Vector* @aartbik @dcaballe @nicolasvasilache
/mlir/include/mlir/Dialect/Vector/* @dcaballe @nicolasvasilache
/mlir/lib/Dialect/Vector/* @dcaballe @nicolasvasilache
/mlir/include/mlir/Dialect/Vector @dcaballe @nicolasvasilache
/mlir/lib/Dialect/Vector @dcaballe @nicolasvasilache
/mlir/lib/Dialect/Vector/Transforms/* @hanhanW @nicolasvasilache
/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @MaheshRavishankar @nicolasvasilache
/mlir/**/*EmulateNarrowType* @dcaballe @hanhanW
@@ -120,6 +120,9 @@ clang/test/AST/Interp/ @tbaederr
/mlir/**/LLVMIR/**/BasicPtxBuilderInterface* @grypp
/mlir/**/NVVM* @grypp

# MLIR Index Dialect
/mlir/**/Index* @mogball

# MLIR Python Bindings
/mlir/test/python/ @ftynse @makslevental @stellaraccident
/mlir/python/ @ftynse @makslevental @stellaraccident
@@ -141,3 +144,8 @@ clang/test/AST/Interp/ @tbaederr

# ExtractAPI
/clang/**/ExtractAPI @daniel-grumberg

# DWARFLinker, dwarfutil, dsymutil
/llvm/**/DWARFLinker/ @JDevlieghere
/llvm/**/dsymutil/ @JDevlieghere
/llvm/**/llvm-dwarfutil/ @JDevlieghere
7 changes: 6 additions & 1 deletion .github/workflows/issue-write.yml
@@ -5,6 +5,7 @@ on:
workflows:
- "Check code formatting"
- "Check for private emails used in PRs"
- "PR Request Release Note"
types:
- completed

@@ -92,7 +93,11 @@ jobs:
var pr_number = 0;
gql_result.repository.ref.associatedPullRequests.nodes.forEach((pr) => {
if (pr.baseRepository.owner.login = context.repo.owner && pr.state == 'OPEN') {
// The largest PR number is the one we care about. The only way
// to have more than one associated pull requests is if all the
// old pull requests are in the closed state.
if (pr.baseRepository.owner.login = context.repo.owner && pr.number > pr_number) {
pr_number = pr.number;
}
});
8 changes: 7 additions & 1 deletion .github/workflows/pr-request-release-note.yml
@@ -2,7 +2,6 @@ name: PR Request Release Note

permissions:
contents: read
pull-requests: write

on:
pull_request:
@@ -41,3 +40,10 @@ jobs:
--token "$GITHUB_TOKEN" \
request-release-note \
--pr-number ${{ github.event.pull_request.number}}
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
if: always()
with:
name: workflow-args
path: |
comments
88 changes: 79 additions & 9 deletions bolt/docs/CommandLineArgumentReference.md
@@ -56,6 +56,14 @@

Allow processing of stripped binaries

- `--alt-inst-feature-size=<uint>`

Size of feature field in .altinstructions

- `--alt-inst-has-padlen`

Specify that .altinstructions has padlen field

- `--asm-dump[=<dump folder>]`

Dump function into assembly
@@ -78,6 +86,16 @@
in the input is decoded and re-encoded. If the resulting bytes do not match
the input, a warning message is printed.

- `--comp-dir-override=<string>`

  Overrides DW_AT_comp_dir, and provides an alternative base location, which is
  used with DW_AT_dwo_name to construct a path to *.dwo files.

- `--create-debug-names-section`

Creates .debug_names section, if the input binary doesn't have it already, for
DWARF5 CU/TUs.

- `--cu-processing-batch-size=<uint>`

Specifies the size of batches for processing CUs. Higher number has better
@@ -93,7 +111,7 @@

- `--debug-skeleton-cu`

Prints out offsetrs for abbrev and debu_info of Skeleton CUs that get patched.
Prints out offsets for abbrev and debug_info of Skeleton CUs that get patched.

- `--deterministic-debuginfo`

@@ -104,6 +122,10 @@

Add basic block instructions as tool tips on nodes

- `--dump-alt-instructions`

Dump Linux alternative instructions info

- `--dump-cg=<string>`

Dump callgraph to the given file
@@ -117,10 +139,34 @@
  Dump function CFGs to graphviz format after each stage; enable '-print-loops'
  for color-coded blocks

- `--dump-linux-exceptions`

Dump Linux kernel exception table

- `--dump-orc`

Dump raw ORC unwind information (sorted)

- `--dump-para-sites`

  Dump Linux kernel paravirtual patch sites

- `--dump-pci-fixups`

Dump Linux kernel PCI fixup table

- `--dump-smp-locks`

Dump Linux kernel SMP locks

- `--dump-static-calls`

Dump Linux kernel static calls

- `--dump-static-keys`

Dump Linux kernel static keys jump table

- `--dwarf-output-path=<string>`

Path to where .dwo files or dwp file will be written out to.
@@ -205,6 +251,18 @@

Skip processing of cold functions

- `--log-file=<string>`

Redirect journaling to a file instead of stdout/stderr

- `--long-jump-labels`

Always use long jumps/nops for Linux kernel static keys

- `--match-profile-with-function-hash`

Match profile with function hash

- `--max-data-relocations=<uint>`

Maximum number of data relocations to process
@@ -274,6 +332,10 @@

Number of tasks to be created per thread

- `--terminal-trap`

Assume that execution stops at trap instruction

- `--thread-count=<uint>`

Number of threads
@@ -618,10 +680,6 @@
  threshold means fewer functions to process. E.g., a threshold of 90 means only the
  top 10 percent of functions with profile will be processed.

- `--mcf-use-rarcs`

In MCF, consider the possibility of cancelling flow to balance edges

- `--memcpy1-spec=<func1,func2:cs1:cs2,func3:cs1,...>`

List of functions with call sites for which to specialize memcpy() for size 1
@@ -710,7 +768,7 @@
- `none`: do not reorder functions
- `exec-count`: order by execution count
- `hfsort`: use hfsort algorithm
- `hfsort+`: use hfsort+ algorithm
- `hfsort+`: use cache-directed sort
- `cdsort`: use cache-directed sort
- `pettis-hansen`: use Pettis-Hansen algorithm
- `random`: reorder functions randomly
@@ -804,8 +862,8 @@

- `--stale-matching-min-matched-block=<uint>`

Minimum percent of exact match block for a function to be considered for
profile inference.
Percentage threshold of matched basic blocks at which stale profile inference
is executed.

- `--stale-threshold=<uint>`

@@ -853,6 +911,10 @@

Only apply branch boundary alignment in hot code

- `--x86-strip-redundant-address-size`

Remove redundant Address-Size override prefix

### BOLT options in relocation mode:

- `--align-macro-fusion=<value>`
@@ -1039,6 +1101,10 @@

Print clusters

- `--print-estimate-edge-counts`

Print function after edge counts are set for no-LBR profile

- `--print-finalized`

Print function after CFG is finalized
@@ -1071,6 +1137,10 @@

Print functions after inlining optimization

- `--print-large-functions`

Print functions that could not be overwritten due to excessive size

- `--print-longjmp`

Print functions after longjmp pass
@@ -1166,4 +1236,4 @@

- `--print-options`

Print non-default options after command line parsing
Print non-default options after command line parsing
120 changes: 120 additions & 0 deletions bolt/docs/OptimizingLinux.md
@@ -0,0 +1,120 @@
# Optimizing Linux Kernel with BOLT


## Introduction

Many Linux applications spend a significant amount of their execution time in the kernel. Thus, when we consider code optimization for system performance, it is essential to improve the CPU utilization not only in the user-space applications and libraries but also in the kernel. BOLT has demonstrated double-digit gains while being applied to user-space programs. This guide shows how to apply BOLT to the x86-64 Linux kernel and enhance your system's performance. In our experiments, BOLT boosted database TPS by 2 percent when applied to the kernel compiled with the highest level optimizations, including PGO and LTO. The database spent ~40% of the time in the kernel and was quite sensitive to kernel performance.

BOLT optimizes code layout based on a low-level execution profile collected with the Linux `perf` tool. The best quality profile should include branch history, such as Intel's last branch records (LBR). BOLT runs on a linked binary and reorders the code while combining frequently executed blocks of instructions in a manner best suited for the hardware. Other than branch instructions, most of the code is left unchanged. Additionally, BOLT updates all metadata associated with the modified code, including DWARF debug information and Linux ORC unwind information.

While BOLT optimizations are not specific to the Linux kernel, certain quirks distinguish the kernel from user-level applications.

BOLT has been successfully applied to and tested with several flavors of the x86-64 Linux kernel.


## QuickStart Guide

BOLT operates on a statically-linked kernel executable, a.k.a. `vmlinux` binary. However, most Linux distributions use a `vmlinuz` compressed image for system booting. To use BOLT on the kernel, you must either repackage `vmlinuz` after BOLT optimizations or add steps for running BOLT into the kernel build and rebuild `vmlinuz`. Uncompressing `vmlinuz` and repackaging it with a new `vmlinux` binary falls beyond the scope of this guide, and at some point, we may add the capability to run BOLT directly on `vmlinuz`. Meanwhile, this guide focuses on steps for integrating BOLT into the kernel build process.


### Building the Kernel

After downloading the kernel sources and configuration for your distribution, you should be able to build `vmlinuz` using the `make bzImage` command. Ideally, the kernel should be a binary match for the kernel on the system you are about to optimize (the target system). The binary matching part is critical, as BOLT performs profile matching and optimizations at the binary level. We recommend installing a freshly built kernel on the target system to avoid any discrepancies.

Note that the kernel build will produce several artifacts besides bzImage. The most important of them is the uncompressed `vmlinux` binary, which will be used in the next steps. Make sure to save this file.
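
If you are unsure of the exact commands, the following is a minimal sketch of the build step, assuming the distribution config has already been copied to `.config` in the kernel source tree:

```bash
# Minimal build sketch; assumes the distribution config is already in .config.
$ make olddefconfig           # resolve any new config options with defaults
$ make -j"$(nproc)" bzImage   # produces bzImage and the uncompressed vmlinux
$ cp vmlinux vmlinux.orig     # keep a copy of vmlinux for the BOLT steps below
```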

Both the build and target systems should have the `perf` tool installed for collecting and processing profiles. If your build system differs from the target, make sure the `perf` versions are compatible. The build system should also have the latest BOLT binary and tools (`llvm-bolt`, `perf2bolt`).
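
A quick sanity check on the build system (the exact version output will vary between installations):

```bash
# Confirm that the profiling and BOLT tools are available before proceeding.
$ perf --version
$ llvm-bolt --version
$ perf2bolt --help | head -n 1
```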

Once the target system boots with the freshly-built kernel, start your workload, such as a database benchmark. While the system is under load, collect the kernel profile using perf:


```bash
$ sudo perf record -a -e cycles -j any,k -F 5000 -- sleep 600
```


Convert the `perf` profile into a format suitable for BOLT by passing the `vmlinux` binary to `perf2bolt`:


```bash
$ sudo chown $USER perf.data
$ perf2bolt -p perf.data -o perf.fdata vmlinux
```


Under a high load, `perf.data` should be several gigabytes in size and you should expect the converted `perf.fdata` not to exceed 100 MB.

Two changes are required for the kernel build. The first one is optional but highly recommended. It introduces BOLT-reserved space into the `vmlinux` code section:


```diff
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -139,6 +139,11 @@ SECTIONS
STATIC_CALL_TEXT
*(.gnu.warning)

+ /* Allocate space for BOLT */
+ __bolt_reserved_start = .;
+ . += 2048 * 1024;
+ __bolt_reserved_end = .;
+
#ifdef CONFIG_RETPOLINE
__indirect_thunk_start = .;
*(.text.__x86.*)
```


The second patch adds a step that runs BOLT on the `vmlinux` binary:


```diff
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -340,5 +340,13 @@ if is_enabled CONFIG_KALLSYMS; then
fi
fi

+# Apply BOLT
+BOLT=llvm-bolt
+BOLT_PROFILE=perf.fdata
+BOLT_OPTS="--dyno-stats --eliminate-unreachable=0 --reorder-blocks=ext-tsp --simplify-conditional-tail-calls=0 --skip-funcs=__entry_text_start,irq_entries_start --split-functions"
+mv vmlinux vmlinux.pre-bolt
+echo BOLTing vmlinux
+${BOLT} vmlinux.pre-bolt -o vmlinux --data ${BOLT_PROFILE} ${BOLT_OPTS}
+
# For fixdep
echo "vmlinux: $0" > .vmlinux.d
```


If you skipped the first step or are running BOLT on a pre-built `vmlinux` binary, drop the `--split-functions` option.
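
For reference, running BOLT by hand on a pre-built `vmlinux` (with `--split-functions` dropped) might look like the sketch below; file names are placeholders:

```bash
# Sketch: BOLT a pre-built vmlinux outside of the kernel build.
# --split-functions is omitted because no space was reserved in the linker script.
$ llvm-bolt vmlinux -o vmlinux.bolt --data perf.fdata \
    --dyno-stats --eliminate-unreachable=0 --reorder-blocks=ext-tsp \
    --simplify-conditional-tail-calls=0 \
    --skip-funcs=__entry_text_start,irq_entries_start
```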


## Performance Expectations

By improving the code layout, BOLT can boost the kernel's performance by up to 5% by reducing instruction cache misses and branch mispredictions. When measuring total system performance, you should scale this number accordingly based on the time your application spends in the kernel (excluding I/O time).


## Profile Quality

The timing and duration of the profiling may have a significant effect on the performance of the BOLTed kernel. If you don't know your workload well, it's recommended that you profile for the whole duration of the benchmark run. As longer runs will produce larger `perf.data` files, you can lower the profiling frequency by passing a smaller value to the `-F` flag. E.g., to record the kernel profile for half an hour, use the following command:


```bash
$ sudo perf record -a -e cycles -j any,k -F 1000 -- sleep 1800
```



## BOLT Disassembly

BOLT annotates the disassembly with control-flow information and attaches Linux-specific metadata to the code. To view annotated disassembly, run:


```bash
$ llvm-bolt vmlinux -o /dev/null --print-cfg
```


If you want to limit the disassembly to a set of functions, add `--print-only=<func1regex>,<func2regex>,...`, where function names are specified using regular expressions.
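
For example (the function names below are only illustrative):

```bash
# Print annotated CFGs only for functions matching the given regexes.
$ llvm-bolt vmlinux -o /dev/null --print-cfg --print-only=schedule,do_syscall_64
```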