# Workflow file for the run "Micro optimization for softmax_forward_kernel5" (#366).
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below.
# To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# Display name of this workflow.
name: Unit, Static and other Tests

# Events that start this workflow.
on:
  # Branch or tag creation.
  create:
  # Manual runs.
  workflow_dispatch:
  # Pushes to master.
  push:
    branches:
      - master
  # Pull requests targeting master.
  pull_request:
    branches:
      - master
jobs:
  # Builds and runs the dataloader test twice: first a plain build, then a
  # build with AddressSanitizer flags passed through TEST_CFLAGS.
  dataloader_test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: test the dataloader without / with sanitize address
        run: |
          cd dev/test
          make PRECISION=BF16 test_dataloader
          ./test_dataloader
          make clean
          make PRECISION=BF16 TEST_CFLAGS="-fsanitize=address -fno-omit-frame-pointer" test_dataloader
          ./test_dataloader

  # Builds train_gpt2cu and the dev/cuda targets inside a CUDA devel container,
  # dumps PTX/SASS with cuobjdump, and uploads the dumps as artifacts: one set
  # for the default build, one for GPU_COMPUTE_CAPABILITY=80 and one for 90.
  ptx_and_sass_files:
    runs-on: ubuntu-latest
    container:
      image: nvidia/cuda:12.4.1-devel-ubuntu22.04
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install OpenMP and OpenMPI
        run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev

      # Default build. The train_gpt2cu dumps are written into dev/cuda so the
      # *.ptx / *.sass globs below pick them up with the other generated files.
      - name: Generate ptx/sass files and upload them to persistent storage
        run: |
          mkdir -p dev/cuda/ptx_sass_logs
          make train_gpt2cu
          cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
          cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
          cd dev/cuda
          make -j all_ptx
          make -j all_sass
          cp *.ptx ptx_sass_logs/
          cp *.sass ptx_sass_logs/
          ls ptx_sass_logs/

      # make decides what to rebuild from file timestamps and does not notice
      # that GPU_COMPUTE_CAPABILITY changed on the command line, so -B
      # (--always-make) forces a rebuild here. Without it, outputs left behind
      # by the previous step could be reused unchanged.
      # NOTE(review): assumes these are ordinary file targets; the Makefiles
      # are not visible from this workflow - confirm.
      - name: Generate ptx/sass files for A100 and upload them to persistent storage
        run: |
          mkdir -p dev/cuda/ptx_sass_logs_A100
          make -B train_gpt2cu GPU_COMPUTE_CAPABILITY=80
          cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
          cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
          cd dev/cuda
          make -B -j GPU_COMPUTE_CAPABILITY=80 all_ptx
          make -B -j GPU_COMPUTE_CAPABILITY=80 all_sass
          cp *.ptx ptx_sass_logs_A100/
          cp *.sass ptx_sass_logs_A100/
          ls ptx_sass_logs_A100/

      # Same forced rebuild as the A100 step, for compute capability 90.
      - name: Generate ptx/sass files for H100 and upload them to persistent storage
        run: |
          mkdir -p dev/cuda/ptx_sass_logs_H100
          make -B train_gpt2cu GPU_COMPUTE_CAPABILITY=90
          cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
          cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
          cd dev/cuda
          make -B -j GPU_COMPUTE_CAPABILITY=90 all_ptx
          make -B -j GPU_COMPUTE_CAPABILITY=90 all_sass
          cp *.ptx ptx_sass_logs_H100/
          cp *.sass ptx_sass_logs_H100/
          ls ptx_sass_logs_H100/

      - name: Upload ptx/sass files
        uses: actions/upload-artifact@v4
        with:
          name: ptx_sass_files
          path: dev/cuda/ptx_sass_logs/
          retention-days: 30  # days to retain

      - name: Upload ptx/sass files for A100
        uses: actions/upload-artifact@v4
        with:
          name: ptx_sass_files_A100
          path: dev/cuda/ptx_sass_logs_A100/
          retention-days: 30  # days to retain

      - name: Upload ptx/sass files for H100
        uses: actions/upload-artifact@v4
        with:
          name: ptx_sass_files_H100
          path: dev/cuda/ptx_sass_logs_H100/
          retention-days: 30  # days to retain