Skip to content

Commit

Permalink
Merge branch 'main_perf' into main_perf-rmsnorm
Browse files Browse the repository at this point in the history
  • Loading branch information
rahulbatra85 authored Sep 6, 2024
2 parents f80aed7 + 3704738 commit 9da4278
Show file tree
Hide file tree
Showing 13 changed files with 1,594 additions and 0 deletions.
14 changes: 14 additions & 0 deletions python/perf-kernels/tools/amdgcn-cfg/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Control Flow Graph Generator from AMDGCN assembly

The script reads an assembly file and generates a Control Flow Graph (CFG) for each function in the file. The graph can be saved in `dot`, `svg` and `pdf` formats. The nodes of a graph can be represented with 1) just labels or 2) the corresponding assembly code. The edges of a graph help to identify cycles and thus make the code easier to navigate.


### Basic usage

```
python ./amdgcn-cfg.py -i <path to assembly file> -o <output directory>/<output prefix> -f [dot|svg|pdf]
```

`dot`-files can be visualized with [this](https://dreampuf.github.io/GraphvizOnline) online tool. You just need to copy and paste the content of a generated `dot`-file.

By default, the nodes are named with basic block labels. Use the `-v` or `--verbose` option to add the assembly source code to the corresponding nodes.
222 changes: 222 additions & 0 deletions python/perf-kernels/tools/amdgcn-cfg/amdgcn-cfg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
import os
import argparse
import re
from collections import OrderedDict
import graphviz


class Options:
    """Validated run-time options of the CFG generator.

    Instances expose ``input_file``, ``output_file``, ``verbose``, ``format``
    and ``output_dir`` (the directory part of ``output_file``).
    """

    def __init__(self, input_file, output_file, verbose, format):
        """Validate the paths and store the options.

        Args:
            input_file: path to the assembly file to read.
            output_file: output path prefix, ``<directory>/<prefix>`` or just
                ``<prefix>`` for the current directory.
            verbose: stored as-is.
            format: stored as-is.

        Raises:
            RuntimeError: if a path is missing, the input file does not exist,
                or the output directory does not exist.
        """
        # Reject None/'' explicitly; os.path.exists(None) would raise TypeError.
        if not input_file:
            raise RuntimeError('input file is not provided')
        if not os.path.exists(input_file):
            raise RuntimeError(f'input file does not exist: {input_file}')
        if not output_file:
            raise RuntimeError('output file prefix is not provided')

        # os.path.dirname() returns '' for a bare prefix, and '' never
        # "exists"; treat it as the current directory instead of failing.
        output_dir = os.path.dirname(output_file) or '.'
        if not os.path.exists(output_dir):
            raise RuntimeError('output directory does not exist')

        self.input_file = input_file
        self.output_file = output_file
        self.verbose = verbose
        self.format = format
        self.output_dir = output_dir


class Block:
    """A basic block: its label, its assembly lines and its outgoing edges."""

    def __init__(self, label, code):
        """Store ``label`` and ``code``; ``edges`` starts as an empty list."""
        # Every instance gets its own list, so edges are never shared.
        self.edges = list()
        self.code = code
        self.label = label


class Kernel:
    """A kernel found in the assembly: its name, its blocks and its graph."""

    def __init__(self, kernel_name, blocks):
        """Store the name and blocks; ``cfg`` stays ``None`` until assigned."""
        self.cfg = None
        self.blocks = blocks
        self.name = kernel_name


# Synthetic block labels that do not come from the assembly itself:
# 'Begin' names the code located before the first label of a kernel,
# 'End' is the empty sink block that `s_endpgm` points to.
begin_label = 'Begin'
end_label = 'End'


def find_kernel(text):
    """Locate the first kernel in a list of assembly lines.

    A kernel starts at the first line matching the function-name pattern (a
    name at column 0 that does not begin with a dot, followed by ``:``) and
    ends at the first following line that contains ``s_endpgm``.

    Args:
        text: list of assembly lines without trailing newlines.

    Returns:
        ``(name, lines, end)`` where ``lines`` is ``text[start:end + 1]`` and
        ``end`` is the index of the ``s_endpgm`` line within ``text``, or
        ``(None, None, None)`` when no complete kernel is found.
    """
    func_name_expr = r'^([^\s^\.]\w.+):'
    func_name = None
    start = None
    for index, line in enumerate(text):
        match = re.search(func_name_expr, line)
        if match is not None:
            func_name = match[1]
            start = index
            break
    if start is None:
        return None, None, None

    # Search from the name line onwards. Scanning from line 0 could pick up an
    # `s_endpgm` located before the kernel, giving end < start and an empty
    # kernel slice.
    end = None
    for index in range(start, len(text)):
        if 's_endpgm' in text[index]:
            end = index
            break

    if end is None:
        return None, None, None

    return func_name, text[start:end + 1], end


def find_label(kernel):
    """Return the first local label in ``kernel`` and the line it is on.

    A label line starts at column 0 with a dot, followed by word characters
    and a colon; the returned label excludes the dot and the colon.

    Returns:
        ``(label, index)`` for the first label line. When no line matches,
        the label is ``None`` and the index is that of the last line scanned
        (``None`` for an empty list), so callers must test the label, not the
        index.
    """
    label_expr = r'^\.(\w+):'
    position = None
    for position, text in enumerate(kernel):
        found = re.search(label_expr, text)
        if found:
            return found.group(1), position
    return None, position


def get_block_list(kernel):
    """Split the lines of one kernel into labelled basic blocks.

    Args:
        kernel: list of assembly lines of a single kernel, with the kernel
            name line first.

    Returns:
        An ``OrderedDict`` mapping label -> ``Block`` in source order. Code
        located before the first label goes into a ``begin_label`` block, and
        an empty ``end_label`` block is always appended last.
    """
    blocks = OrderedDict()
    label, index = find_label(kernel)

    if label is None:
        # No local label at all. `index` is not a label position here (it may
        # even be None), so the whole kernel becomes the Begin block.
        if kernel:
            blocks[begin_label] = Block(begin_label, kernel[:])
    elif index > 1:
        # Everything up to, but not including, the first label line. Slicing
        # with `index - 1` would drop the last line of the block, which is
        # where a branch usually sits.
        blocks[begin_label] = Block(begin_label, kernel[:index])

    while label is not None:
        # Drop the label line itself; `kernel` now starts at the block body.
        kernel = kernel[index + 1:]
        next_label, next_index = find_label(kernel)
        if next_label is None:
            # Last block: take all remaining lines. `index` refers to the
            # previous slice and must not be used to cut this one.
            code = kernel
        else:
            code = kernel[:next_index]
        blocks[label] = Block(label, code)

        label = next_label
        index = next_index

    blocks[end_label] = Block(end_label, [])

    return blocks


def find_terminators(code):
    """Collect the control-flow targets of a sequence of assembly lines.

    Lines are scanned in order. The target of every conditional branch
    (``...cbranch...``) is recorded and scanning continues. An unconditional
    branch or ``s_endpgm`` (recorded as ``end_label``) ends the scan, since
    nothing after it is reachable by falling through.

    Args:
        code: list of assembly lines.

    Returns:
        ``(labels, is_unconditional)``: the targets in source order (without
        the leading dot) and whether the scan stopped at an unconditional
        terminator.
    """
    terminator_labels = []
    for line in code:
        # Group 1 is 'c' for conditional branches only; group 2 is the target.
        # The target is limited to word characters so that a trailing comment
        # or a stray '\r' is not mistaken for the label.
        branch = re.search(r'(c)?branch\S*\s+\.?(\w+)', line)
        if branch is not None:
            # Match.groups() always has one entry per group in the pattern,
            # so its length cannot tell the two kinds apart; check whether
            # the optional group actually participated.
            is_conditional = branch[1] is not None
            terminator_labels.append(branch[2])
            if not is_conditional:
                return terminator_labels, True
        if 's_endpgm' in line:
            terminator_labels.append(end_label)
            return terminator_labels, True

    return terminator_labels, False


def add_edges(kernel):
    """Fill in ``edges`` for every block of ``kernel`` except the End block.

    The edges of a block are the targets reported by ``find_terminators``.
    If the block does not end in an unconditional terminator, an edge to the
    next block in source order is added, unless it is already present.

    Args:
        kernel: object whose ``blocks`` maps label -> block in source order.
    """
    keys = list(kernel.blocks.keys())
    for index, curr_label in enumerate(keys):
        if curr_label == end_label:
            continue

        # Scanning the whole block at once also copes with an empty block and
        # with an unconditional jump that is not on the block's last line
        # (e.g. one followed by an alignment directive).
        targets, is_unconditional = find_terminators(kernel.blocks[curr_label].code)

        if not is_unconditional and index + 1 < len(keys):
            next_block_label = keys[index + 1]
            if next_block_label not in targets:
                targets.append(next_block_label)

        kernel.blocks[curr_label].edges = targets


def generate_cfg(kernel, options):
    """Build a graphviz ``Digraph`` from the blocks of ``kernel``.

    One rectangular node is created per block. With ``options.verbose`` the
    node text is the ``repr`` of the label followed by the block's stripped
    assembly lines joined with newlines; otherwise it is just the label. All
    nodes are emitted first, then one edge per entry of each block's
    ``edges``.

    Returns:
        The populated ``graphviz.Digraph`` named after the kernel.
    """
    graph = graphviz.Digraph(f'{kernel.name}')

    # First pass: nodes.
    for name, block in kernel.blocks.items():
        if options.verbose:
            stripped = [line.strip() for line in block.code]
            text = repr('\n'.join([f'{name}', *stripped]))
        else:
            text = name
        graph.node(name, shape='rect', labeljust='l', margin='0.01', label=text)

    # Second pass: edges.
    for name, block in kernel.blocks.items():
        for target in block.edges:
            graph.edge(name, target)

    return graph


def main(options):
    """Read the assembly file and write one CFG per kernel found in it.

    Kernels are extracted one after another with ``find_kernel``. Each is
    split into blocks, connected with edges and turned into a graph. Output
    files are named ``<options.output_file>.kernel-<index>``: ``dot`` output
    is written directly as text, other formats are rendered by graphviz.

    Args:
        options: object providing ``input_file``, ``output_file``, ``format``
            and ``verbose``.
    """
    with open(options.input_file, 'r') as file:
        # Strip only the newline. Cutting the last character unconditionally
        # would damage a final line that has no trailing newline
        # ('s_endpgm' -> 's_endpg').
        asm = [line.rstrip('\n') for line in file]

    kernels = []
    while True:
        func_name, kernel_asm, last_end_index = find_kernel(asm)
        if kernel_asm is None:
            break

        blocks = get_block_list(kernel_asm)
        kernel = Kernel(func_name, blocks)
        add_edges(kernel)

        kernel.cfg = generate_cfg(kernel, options)
        kernels.append(kernel)
        # Continue the search right after this kernel's `s_endpgm`.
        asm = asm[last_end_index + 1:]

    for index, kernel in enumerate(kernels):
        output_file_name = f'{options.output_file}.kernel-{index}'
        if options.format == 'dot':
            with open(f'{output_file_name}.dot', 'w') as file:
                file.write(str(kernel.cfg))
                file.write('\n')
        else:
            kernel.cfg.render(
                filename=f'{output_file_name}',
                format=options.format,
            )


if __name__ == "__main__":
    # The sentence is a description of the tool; `prog` is the program name
    # shown in the usage line and is best left to argparse's default.
    parser = argparse.ArgumentParser(description="Generates Control Flow Graph (CFG) from amdgcn assembly file")
    # Both paths are mandatory: let argparse report a usage error instead of
    # failing later on a None path.
    parser.add_argument("-i", "--input", type=str, required=True, help="input file")
    parser.add_argument("-o", "--output", type=str, required=True, help="output file prefix")
    parser.add_argument("-v", "--verbose", action='store_true', help='verbose output')
    parser.add_argument("-f", "--format", choices=['dot', 'svg', 'pdf'], default="dot", help="output format type")
    args = parser.parse_args()

    options = Options(args.input, args.output, args.verbose, args.format)

    main(options)
71 changes: 71 additions & 0 deletions python/perf-kernels/tools/occ.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#! /bin/bash

## $1: input script that contains one kernel
## The command given as $1 is run further down with its combined
## stdout/stderr captured in output.mlir.

## Remove the Triton cache directory
## (presumably to force a fresh compilation so the dumps are emitted -- TODO confirm)
rm -rf ~/.triton/cache/

## Dump switches exported to the command run below
export MLIR_ENABLE_DUMP=1
export AMDGCN_ENABLE_DUMP=1
## Assume CDNA arch
## SIMD: multiplier/divisor between per-SIMD and per-CU occupancy
SIMD=4
## LDS_SIZE: LDS budget divided by the kernel's reported shared-memory size
LDS_SIZE=65536
## TOTAL_VGPR: VGPR budget (presumably per SIMD lane -- TODO confirm)
TOTAL_VGPR=512

get_occ_per_CU() {
    ## $1: vgpr count
    ## Reads the globals SIMD and num_warps.
    ## Prints the VGPR-limited occupancy in workgroups per CU.
    local vgpr=$1
    local occPerEU
    local occPerCU

    ## Waves per SIMD come from the step table below. The former
    ## TOTAL_VGPR/vgpr pre-computation was always overwritten by this table
    ## and aborted the function when the VGPR count was 0 or empty.
    if [[ $vgpr -gt 256 ]]; then
        occPerEU=1
    elif [[ $vgpr -gt 168 ]]; then
        occPerEU=2
    elif [[ $vgpr -gt 128 ]]; then
        occPerEU=3
    elif [[ $vgpr -gt 96 ]]; then
        occPerEU=4
    elif [[ $vgpr -gt 80 ]]; then
        occPerEU=5
    elif [[ $vgpr -gt 72 ]]; then
        occPerEU=6
    elif [[ $vgpr -gt 64 ]]; then
        occPerEU=7
    else
        occPerEU=8
    fi

    occPerCU=$((occPerEU*SIMD/num_warps))
    echo $occPerCU
}

## Run the input command; everything it prints (stdout and stderr) is
## captured in output.mlir and parsed below.
$1 > output.mlir 2>&1

## Last occurrence of the shared-memory size and of the number of warps
LDS_line=$(sed -n '/triton_gpu\.shared\ /p' output.mlir | tail -n 1 | grep -o 'triton_gpu.shared = [0-9]*')
numWarps_line=$(sed -n '/triton_gpu\.num-warps/p' output.mlir | tail -n 1 | grep -o 'triton_gpu.num-warps. = [0-9]*')

## Keep only what follows the last '='
LDS=${LDS_line##*=}
num_warps=${numWarps_line##*=}
echo "LDS: $LDS, num_warps: $num_warps"

VGPRs=$(sed -n '/vgpr_count/p' output.mlir | tail -n 1 | awk '{print $2}')
SPILLs=$(sed -n '/vgpr_spill/p' output.mlir | tail -n 1 | awk '{print $2}')

echo "VGPRS: $VGPRs (spill: $SPILLs)"

occVgprPerCU=$(get_occ_per_CU $VGPRs)
occPerCU=$occVgprPerCU
## When the reported shared-memory size is 0 (or missing), LDS_SIZE/LDS would
## be a division by zero; LDS does not limit occupancy in that case.
if [[ ${LDS:-0} -gt 0 ]]; then
    occLDSPerCU=$((LDS_SIZE/LDS))
    if [ $occLDSPerCU -lt $occVgprPerCU ];then
        occPerCU=$occLDSPerCU
    fi
else
    occLDSPerCU="n/a"
fi
occPerEU=$((occPerCU*num_warps/SIMD))
echo "occupancy: $occPerEU waves/SIMD or $occPerCU workgroups/CU (occLDSPerCU: $occLDSPerCU, occVgprPerCU: $occVgprPerCU)"

## The last two lines of the captured output are echoed back
perf=$(tail -n 2 output.mlir)
echo "$perf"

## remove distracting info from the assembly
sed -i '/local_/! {/\.loc/d}' output.mlir
sed -i '/\.Ltmp.*:/d' output.mlir
sed -i '/AMD clang version/d' output.mlir

## Everything from the first line containing AMDGCN to the end of the file
sed -n '/AMDGCN/, $p' output.mlir > output.amdgcn
Loading

0 comments on commit 9da4278

Please sign in to comment.