AztecProtocol · charlielye · Feb 26, 2025 · Feb 26, 2025 · Feb 26, 2025 · Feb 26, 2025
@@ -29,7 +29,7 @@ jobs:
       matrix:
         # Only run arm64 build with arm64-ci label or on master.
         # The way to do conditions here is to parse full strings as JSON.
-        settings:  >-
+        settings: >-
           ${{ fromJson(
             (contains(github.event.pull_request.labels.*.name, 'arm64-ci') || github.ref_name == 'master') &&
             '[{"arch":"amd64"},{"arch":"arm64"}]' ||
@@ -76,6 +76,7 @@ jobs:
           NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
           NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
           DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }}
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
         run: |
           ./ci.sh ec2
 
@@ -170,6 +171,7 @@ jobs:
           INSTANCE_POSTFIX: ${{ matrix.number }}
           DRY_RUN: 1
           NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
         run: |
           ./ci.sh ec2-test
 

diff --git a/.test_patterns.yml b/.test_patterns.yml
@@ -0,0 +1,139 @@
+# Describes tests that we either:
+#   - Outright skip.
+#   - Run and alert owners on failure.
+# To outright skip a test add a "skip: true" property.
+# Only do this for tests that are currently expected to fail constantly and the noise is unbearable!
+# Otherwise, tests must have owner(s). Owners are identified by their Slack member ID (profile, three dots, copy member id).
+# If there is no owner for a failed test, the build will fail, regardless of whether it matches in this file.
+# When a failing test matches one or more of the "regex" properties below,
+# a message is sent to slack channel #aztec3-ci tagging the owners.
+
+tests:
+  # barretenberg
+  #
+  # Rare. But I saw it happen twice in 10 CI runs. Then twice in 10000 mainframe runs. Today I can't reproduce.
+  # Grind with: seq 1 10000 | parallel --bar "barretenberg/cpp/scripts/run_test.sh join_split_example_tests join_split_tests.test_defi_deposit_second_bridge_output_in_use_and_same_virtual_bridge_output_asset_ids >/dev/null"
+  # Logic failed: field_t::range_constraint
+  # /home/aztec-dev/aztec-packages/barretenberg/cpp/src/barretenberg/examples/join_split/join_split.test.cpp:1735: Failure
+  # Value of: result.valid
+  #   Actual: false
+  # Expected: true
+  - regex: "join_split_example_tests"
+    owners:
+      - "U03JYU7AQET" # luke
+
+  # noir
+  # Something to do with how I run the tests now. Think these are fine in nextest.
+  - regex: "noir_lsp-.* notifications::notification_tests::test_caches_open_files"
+    skip: true
+    owners:
+      - "UKUMA5J7K" # charlie
+  - regex: "noir_lsp-.* requests::"
+    skip: true
+    owners:
+      - "UKUMA5J7K" # charlie
+  # Sometimes see this on ARM. But not when run on its own...
+  # FAILED 6a60c4e796ac0aef: noir/scripts/run_test.sh debug-21ff1948430ded06 tests::debug_ram_blowup_regression (code: 101)
+  #                                                                                                                                                                                running 1 test
+  # test tests::debug_ram_blowup_regression has been running for over 60 seconds                                                                                                   test tests::debug_ram_blowup_regression ... FAILED
+  #                                                                                                                                                                                failures:
+  #                                                                                                                                                                                ---- tests::debug_ram_blowup_regression stdout ----
+  # thread 'tests::debug_ram_blowup_regression' panicked at tooling/debugger/tests/debug.rs:27:14:                                                                                 Could not start debugger: Timeout { expected: "Regex: \".*\Starting debugger.*\"", got: "`^`[?2004l`\r``\r``\n`
+  # Waiting for lock on Nargo.toml...`\r``\n`                                                                                                                                      ", timeout: 30s }
+  # note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
+  #                                                                                                                                                                                failures:
+  #     tests::debug_ram_blowup_regression
+  # test result: FAILED. 0 passed; 1 failed; 0 ignored; 0 measured; 232 filtered out; finished in 60.34s
+  - regex: "tests::debug_ram_blowup_regression"
+    skip: true
+    owners:
+      - "UKUMA5J7K" # charlie
+
+  # Seen this error on all the below.
+  # e.g. to grind: seq 1 16 | parallel --bar --tag --halt now,fail=1 ci3/dump_fail "NAME_POSTFIX=_{} yarn-project/end-to-end/scripts/run_test.sh simple e2e_p2p/gossip_network >/dev/null"
+  # FAIL  e2e_p2p/reqresp.test.ts
+  #  ● e2e_p2p_reqresp_tx › should produce an attestation by requesting tx data over the p2p network
+
+  #    TypeError: Cannot read properties of null (reading 'address')
+
+  #      19 |
+  #      20 |   get(): T | undefined {
+  #    > 21 |     return this.#db.get(this.#slot);
+  #         |                     ^
+  #      22 |   }
+  #      23 |
+  #      24 |   getAsync(): Promise<T | undefined> {
+
+  #      at LMDBStore.getBinaryFast (../../node_modules/lmdb/read.js:90:9)
+  #      at LMDBStore.get (../../node_modules/lmdb/read.js:334:22)
+  #      at LmdbAztecSingleton.get (../../kv-store/src/lmdb/singleton.ts:21:21)
+  #      at initStoreForRollup (../../kv-store/src/utils.ts:26:82)
+  #      at createStore (../../kv-store/src/lmdb/index.ts:25:12)
+  #      at createArchiver (../../archiver/src/factory.ts:30:25)
+  #      at Function.createAndSync (../../aztec-node/src/aztec-node/server.ts:157:28)
+  #      at createAndSync (fixtures/setup_p2p_test.ts:72:33)
+  #          at async Promise.all (index 0)
+  #      at Object.<anonymous> (e2e_p2p/reqresp.test.ts:66:13)
+  - regex: "simple e2e_p2p/"
+    owners:
+      - "U04DT239VQU" # sean
+
+  # FAIL  ./flakey_e2e_inclusion_proofs_contract.test.ts
+  # ● e2e_inclusion_proofs_contract › contract inclusion › proves public deployment of a contract
+  #
+  #   Undefined argument value of type field
+  #
+  #     41 |   private encodeArgument(abiType: AbiType, arg: any, name?: string) {
+  #     42 |     if (arg === undefined || arg == null) {
+  #   > 43 |       throw new Error(`Undefined argument ${name ?? 'unnamed'} of type ${abiType.kind}`);
+  #        |             ^
+  #     44 |     }
+  #     45 |     switch (abiType.kind) {
+  #     46 |       case 'field':
+  #
+  #     at ArgumentEncoder.encodeArgument (../../stdlib/src/abi/encoder.ts:43:13)
+  #     at ArgumentEncoder.encode (../../stdlib/src/abi/encoder.ts:137:12)
+  #     at encodeArguments (../../stdlib/src/abi/encoder.ts:150:41)
+  #     at computeInitializationHash (../../stdlib/src/contract/contract_address.ts:75:20)
+  #     at getContractInstanceFromDeployParams (../../stdlib/src/contract/contract_instance.ts:124:9)
+  #     at Object.getContractInstanceFromDeployParams (flakey_e2e_inclusion_proofs_contract.test.ts:275:24)
+  - regex: "simple flakey_e2e_inclusion_proofs_contract"
+    owners:
+      - "UKUMA5J7K" # charlie
+
+  # FAIL  src/test/bb_prover_parity.test.ts (34.059 s)
+  #  prover/bb_prover/parity
+  #    ✕ proves the parity circuits (30003 ms)
+  #
+  #  ● prover/bb_prover/parity › proves the parity circuits
+  #
+  #    thrown: "Exceeded timeout of 30000 ms for a test.
+  #    Add a timeout value to this test to increase the timeout, if this is a long-running test. See https://jestjs.io/docs/api#testname-fn-timeout."
+  - regex: "prover-client/src/test/bb_prover_parity.test.ts"
+    owners:
+      - "U04TPBU26E8" # palla
+  - regex: "prover-client/src/proving_broker/broker_prover_facade.test.ts"
+    owners:
+      - "U04TPBU26E8" # palla
+  - regex: "prover-client/src/orchestrator/orchestrator_errors.test.ts"
+    owners:
+      - "U04TPBU26E8" # palla
+
+  # yarn-project tests
+  - regex: "p2p/src/services/reqresp/reqresp.test.ts"
+    owners:
+      - "U04DT239VQU" # sean
+  - regex: "sequencer-client/src/slasher/slasher_client.test.ts"
+    owners:
+      - "U03E5SYLY3Z" # lasse
+
+  # kind tests
+  - regex: "spartan/bootstrap.sh"
+    owners:
+      - "UKUMA5J7K" # charlie
+      - "U04BM8H25NJ" # adam
+
+  # Slack testing.
+  - regex: "nonsense to match"
+    owners:
+      - "UKUMA5J7K" # charlie
diff --git a/.test_skip_patterns b/.test_skip_patterns
@@ -18,6 +18,9 @@ else
   fi
 fi
 
+# Allow override.
+cores=${CPUS:-$cores}
+
 # Trap function to terminate our running instance when the script exits.
 function on_exit {
     set +e
@@ -157,6 +160,7 @@ ssh ${ssh_args:-} -F $ci3/aws/build_instance_ssh_config ubuntu@$ip "
     -e GITHUB_TOKEN=${GITHUB_TOKEN:-} \
     -e NETLIFY_SITE_ID=${NETLIFY_SITE_ID:-} \
     -e NETLIFY_AUTH_TOKEN=${NETLIFY_AUTH_TOKEN:-} \
+    -e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN:-} \
     --pids-limit=32768 \
     aztecprotocol/devbox:3.0 bash -c $(printf '%q' "$container_script")
 "

@@ -18,7 +18,8 @@ fi
 
 dots_per_line=${LINE_WIDTH:-64}
 dot_count=0
-status=0
+status="in-progress"
+time="in-progress"
 
 # We don't want to lose color just because we're wrapping.
 export FORCE_COLOR=${FORCE_COLOR:-1}
@@ -38,14 +39,21 @@ function publish_log {
   fi
 
   {
-    echo -e "Command: $cmd (exit: $status)\n"
+    echo "Command: $cmd"
+    echo "   Date: $(date)"
+    echo " Status: $status"
+    echo "   Took: ${time}"
+    echo
+
     cat $outfile
   } | redis_setexz $key $CI_REDIS_EXPIRE
 }
 
 function live_publish_log {
   while [ -f $outfile ]; do
-    publish_log
+    # Republish only if the output file was modified within the last 5 seconds
+    # (presumably to avoid re-uploading an unchanged log on every tick).
+    # The file can be removed between the -f check above and the stat call; in that case
+    # fall back to mtime 0 (treated as stale) rather than handing an empty operand to the
+    # arithmetic expansion, which would be a syntax error.
+    if [ $(( $(date +%s) - $(stat -c %Y "$outfile" 2>/dev/null || echo 0) )) -le 5 ]; then
+      publish_log
+    fi
     sleep 5
   done
 }
@@ -71,15 +79,16 @@ done;
 # Get the exit status of the command
 status=${PIPESTATUS[0]}
 
+time="${SECONDS}s"
 publish_log
 
 # Handle non-zero exit status
 if [ "$status" -ne 0 ]; then
   echo -e "\nCommand exited with status $status. Dumping output:"
   cat $outfile
-  echo -e ". ${red}failed${reset} (${SECONDS}s) ${log_info:-}"
+  echo -e ". ${red}failed${reset} ($time) ${log_info:-}"
 else
-  echo -e ". ${green}done${reset} (${SECONDS}s)"
+  echo -e ". ${green}done${reset} ($time)"
 fi
 
 exit $status
@@ -28,7 +28,7 @@ function process_batch {
 }
 export -f process_batch
 
-if [ "${USE_TEST_CACHE:-0}" -eq 1 ] && [ "$CI_REDIS_AVAILABLE" -eq 1 ]; then
+if [ "$USE_TEST_CACHE" -eq 1 ] && [ "$CI_REDIS_AVAILABLE" -eq 1 ]; then
   exec 3> >(cache_log "Skipped tests" >/dev/null)
   # Process stdin in batches of 50.
   parallel --pipe -N50 --keep-order process_batch

@@ -7,6 +7,11 @@ cd $root
 jobs=$(get_num_cpus_max ${1:-})
 parallel_args="-j$jobs --memsuspend ${MEMSUSPEND:-64G} --line-buffer --joblog joblog.txt"
 
+# If not in CI, fail fast.
+if [ "$CI" -eq 0 ]; then
+  parallel_args+=" --halt now,fail=1"
+fi
+
 echo "Starting test run with max $jobs jobs..."
 
 # If we're in a terminal default to a progress bar, and use cache_log to save output to redis.