Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Slack message to ci channel tagging owners on flakes. #12284

Merged
merged 15 commits into from
Feb 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/ci3.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
matrix:
# Only run arm64 build with arm64-ci label or on master.
# The way to do conditions here is to parse full strings as JSON.
settings: >-
settings: >-
${{ fromJson(
(contains(github.event.pull_request.labels.*.name, 'arm64-ci') || github.ref_name == 'master') &&
'[{"arch":"amd64"},{"arch":"arm64"}]' ||
Expand Down Expand Up @@ -76,6 +76,7 @@ jobs:
NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }}
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
run: |
./ci.sh ec2

Expand Down Expand Up @@ -170,6 +171,7 @@ jobs:
INSTANCE_POSTFIX: ${{ matrix.number }}
DRY_RUN: 1
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
run: |
./ci.sh ec2-test

Expand Down
139 changes: 139 additions & 0 deletions .test_patterns.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# Describes tests that we either:
# - Outright skip.
# - Run and alert owners on failure.
# To outright skip a test add a "skip: true" property.
# Only do this for tests that are currently expected to fail constantly and whose noise is unbearable!
# Otherwise, tests must have owner(s). Owners are identified by their Slack member ID (in Slack: profile, three dots, "Copy member ID").
# If a failed test has no owner, the build will fail, regardless of whether it matches a pattern in this file.
# When a failing test matches one or more of the "regex" properties below,
# a message is sent to the Slack channel #aztec3-ci, tagging the owners.

tests:
# barretenberg
#
# Rare. But I saw it happen twice in 10 CI runs. Then twice in 10000 mainframe runs. Today I can't reproduce.
# Grind with: seq 1 10000 | parallel --bar "barretenberg/cpp/scripts/run_test.sh join_split_example_tests join_split_tests.test_defi_deposit_second_bridge_output_in_use_and_same_virtual_bridge_output_asset_ids >/dev/null"
# Logic failed: field_t::range_constraint
# /home/aztec-dev/aztec-packages/barretenberg/cpp/src/barretenberg/examples/join_split/join_split.test.cpp:1735: Failure
# Value of: result.valid
# Actual: false
# Expected: true
- regex: "join_split_example_tests"
owners:
- "U03JYU7AQET" # luke

# noir
# Something to do with how I run the tests now. Think these are fine in nextest.
- regex: "noir_lsp-.* notifications::notification_tests::test_caches_open_files"
skip: true
owners:
- "UKUMA5J7K" # charlie
- regex: "noir_lsp-.* requests::"
skip: true
owners:
- "UKUMA5J7K" # charlie
# Sometimes see this on ARM. But not when run on its own...
# FAILED 6a60c4e796ac0aef: noir/scripts/run_test.sh debug-21ff1948430ded06 tests::debug_ram_blowup_regression (code: 101)
# running 1 test
# test tests::debug_ram_blowup_regression has been running for over 60 seconds test tests::debug_ram_blowup_regression ... FAILED
# failures:
# ---- tests::debug_ram_blowup_regression stdout ----
# thread 'tests::debug_ram_blowup_regression' panicked at tooling/debugger/tests/debug.rs:27:14: Could not start debugger: Timeout { expected: "Regex: \".*\Starting debugger.*\"", got: "`^`[?2004l`\r``\r``\n`
# Waiting for lock on Nargo.toml...`\r``\n` ", timeout: 30s }
# note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
# failures:
# tests::debug_ram_blowup_regression
# test result: FAILED. 0 passed; 1 failed; 0 ignored; 0 measured; 232 filtered out; finished in 60.34s
- regex: "tests::debug_ram_blowup_regression"
skip: true
owners:
- "UKUMA5J7K" # charlie

# Seen this error on all the below.
# e.g. to grind: seq 1 16 | parallel --bar --tag --halt now,fail=1 ci3/dump_fail "NAME_POSTFIX=_{} yarn-project/end-to-end/scripts/run_test.sh simple e2e_p2p/gossip_network >/dev/null"
# FAIL e2e_p2p/reqresp.test.ts
# ● e2e_p2p_reqresp_tx › should produce an attestation by requesting tx data over the p2p network

# TypeError: Cannot read properties of null (reading 'address')

# 19 |
# 20 | get(): T | undefined {
# > 21 | return this.#db.get(this.#slot);
# | ^
# 22 | }
# 23 |
# 24 | getAsync(): Promise<T | undefined> {

# at LMDBStore.getBinaryFast (../../node_modules/lmdb/read.js:90:9)
# at LMDBStore.get (../../node_modules/lmdb/read.js:334:22)
# at LmdbAztecSingleton.get (../../kv-store/src/lmdb/singleton.ts:21:21)
# at initStoreForRollup (../../kv-store/src/utils.ts:26:82)
# at createStore (../../kv-store/src/lmdb/index.ts:25:12)
# at createArchiver (../../archiver/src/factory.ts:30:25)
# at Function.createAndSync (../../aztec-node/src/aztec-node/server.ts:157:28)
# at createAndSync (fixtures/setup_p2p_test.ts:72:33)
# at async Promise.all (index 0)
# at Object.<anonymous> (e2e_p2p/reqresp.test.ts:66:13)
- regex: "simple e2e_p2p/"
owners:
- "U04DT239VQU" # sean

# FAIL ./flakey_e2e_inclusion_proofs_contract.test.ts
# ● e2e_inclusion_proofs_contract › contract inclusion › proves public deployment of a contract
#
# Undefined argument value of type field
#
# 41 | private encodeArgument(abiType: AbiType, arg: any, name?: string) {
# 42 | if (arg === undefined || arg == null) {
# > 43 | throw new Error(`Undefined argument ${name ?? 'unnamed'} of type ${abiType.kind}`);
# | ^
# 44 | }
# 45 | switch (abiType.kind) {
# 46 | case 'field':
#
# at ArgumentEncoder.encodeArgument (../../stdlib/src/abi/encoder.ts:43:13)
# at ArgumentEncoder.encode (../../stdlib/src/abi/encoder.ts:137:12)
# at encodeArguments (../../stdlib/src/abi/encoder.ts:150:41)
# at computeInitializationHash (../../stdlib/src/contract/contract_address.ts:75:20)
# at getContractInstanceFromDeployParams (../../stdlib/src/contract/contract_instance.ts:124:9)
# at Object.getContractInstanceFromDeployParams (flakey_e2e_inclusion_proofs_contract.test.ts:275:24)
- regex: "simple flakey_e2e_inclusion_proofs_contract"
owners:
- "UKUMA5J7K" # charlie

# FAIL src/test/bb_prover_parity.test.ts (34.059 s)
# prover/bb_prover/parity
# ✕ proves the parity circuits (30003 ms)
#
# ● prover/bb_prover/parity › proves the parity circuits
#
# thrown: "Exceeded timeout of 30000 ms for a test.
# Add a timeout value to this test to increase the timeout, if this is a long-running test. See https://jestjs.io/docs/api#testname-fn-timeout."
- regex: "prover-client/src/test/bb_prover_parity.test.ts"
owners:
- "U04TPBU26E8" # palla
- regex: "prover-client/src/proving_broker/broker_prover_facade.test.ts"
owners:
- "U04TPBU26E8" # palla
- regex: "prover-client/src/orchestrator/orchestrator_errors.test.ts"
owners:
- "U04TPBU26E8" # palla

# yarn-project tests
- regex: "p2p/src/services/reqresp/reqresp.test.ts"
owners:
- "U04DT239VQU" # sean
- regex: "sequencer-client/src/slasher/slasher_client.test.ts"
owners:
- "U03E5SYLY3Z" # lasse

# kind tests
- regex: "spartan/bootstrap.sh"
owners:
- "UKUMA5J7K" # charlie
- "U04BM8H25NJ" # adam

# Slack testing.
- regex: "nonsense to match"
owners:
- "UKUMA5J7K" # charlie
110 changes: 0 additions & 110 deletions .test_skip_patterns

This file was deleted.

4 changes: 4 additions & 0 deletions ci3/bootstrap_ec2
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ else
fi
fi

# Allow override.
cores=${CPUS:-$cores}

# Trap function to terminate our running instance when the script exits.
function on_exit {
set +e
Expand Down Expand Up @@ -157,6 +160,7 @@ ssh ${ssh_args:-} -F $ci3/aws/build_instance_ssh_config ubuntu@$ip "
-e GITHUB_TOKEN=${GITHUB_TOKEN:-} \
-e NETLIFY_SITE_ID=${NETLIFY_SITE_ID:-} \
-e NETLIFY_AUTH_TOKEN=${NETLIFY_AUTH_TOKEN:-} \
-e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN:-} \
--pids-limit=32768 \
aztecprotocol/devbox:3.0 bash -c $(printf '%q' "$container_script")
"
Expand Down
19 changes: 14 additions & 5 deletions ci3/denoise
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ fi

dots_per_line=${LINE_WIDTH:-64}
dot_count=0
status=0
status="in-progress"
time="in-progress"

# We don't want to lose color just because we're wrapping.
export FORCE_COLOR=${FORCE_COLOR:-1}
Expand All @@ -38,14 +39,21 @@ function publish_log {
fi

{
echo -e "Command: $cmd (exit: $status)\n"
echo "Command: $cmd"
echo " Date: $(date)"
echo " Status: $status"
echo " Took: ${time}"
echo

cat $outfile
} | redis_setexz $key $CI_REDIS_EXPIRE
}

function live_publish_log {
while [ -f $outfile ]; do
publish_log
if [ $(( $(date +%s) - $(stat -c %Y "$outfile") )) -le 5 ]; then
publish_log
fi
sleep 5
done
}
Expand All @@ -71,15 +79,16 @@ done;
# Get the exit status of the command
status=${PIPESTATUS[0]}

time="${SECONDS}s"
publish_log

# Handle non-zero exit status
if [ "$status" -ne 0 ]; then
echo -e "\nCommand exited with status $status. Dumping output:"
cat $outfile
echo -e ". ${red}failed${reset} (${SECONDS}s) ${log_info:-}"
echo -e ". ${red}failed${reset} ($time) ${log_info:-}"
else
echo -e ". ${green}done${reset} (${SECONDS}s)"
echo -e ". ${green}done${reset} ($time)"
fi

exit $status
2 changes: 1 addition & 1 deletion ci3/filter_cached_test_cmd
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ function process_batch {
}
export -f process_batch

if [ "${USE_TEST_CACHE:-0}" -eq 1 ] && [ "$CI_REDIS_AVAILABLE" -eq 1 ]; then
if [ "$USE_TEST_CACHE" -eq 1 ] && [ "$CI_REDIS_AVAILABLE" -eq 1 ]; then
exec 3> >(cache_log "Skipped tests" >/dev/null)
# Process stdin in batches of 50.
parallel --pipe -N50 --keep-order process_batch
Expand Down
5 changes: 5 additions & 0 deletions ci3/parallelise
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ cd $root
jobs=$(get_num_cpus_max ${1:-})
parallel_args="-j$jobs --memsuspend ${MEMSUSPEND:-64G} --line-buffer --joblog joblog.txt"

# If not in CI, fail fast.
if [ "$CI" -eq 0 ]; then
parallel_args+=" --halt now,fail=1"
fi

echo "Starting test run with max $jobs jobs..."

# If we're in a terminal default to a progress bar, and use cache_log to save output to redis.
Expand Down
Loading
Loading