From af074243bd8e2d176fdda1f49af6c55f79cc8cd9 Mon Sep 17 00:00:00 2001 From: Lin Huang Date: Tue, 29 Oct 2024 15:47:33 -0400 Subject: [PATCH 01/17] initial version of tests for SHiP restarting from corrupted log and index files --- tests/CMakeLists.txt | 3 + tests/ship_restart_test.py | 314 +++++++++++++++++++++++++++++++++++++ 2 files changed, 317 insertions(+) create mode 100755 tests/ship_restart_test.py diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a6194c13f1..e84d653199 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -53,6 +53,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/full-version-label.sh ${CMAKE_CURRENT configure_file(${CMAKE_CURRENT_SOURCE_DIR}/nodeos_producer_watermark_test.py ${CMAKE_CURRENT_BINARY_DIR}/nodeos_producer_watermark_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cli_test.py ${CMAKE_CURRENT_BINARY_DIR}/cli_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_reqs_across_svnn_test.py ${CMAKE_CURRENT_BINARY_DIR}/ship_reqs_across_svnn_test.py COPYONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_restart_test.py ${CMAKE_CURRENT_BINARY_DIR}/ship_restart_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_test.py ${CMAKE_CURRENT_BINARY_DIR}/ship_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_streamer_test.py ${CMAKE_CURRENT_BINARY_DIR}/ship_streamer_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ship_kill_client_test.py ${CMAKE_CURRENT_BINARY_DIR}/ship_kill_client_test.py COPYONLY) @@ -175,6 +176,8 @@ set_property(TEST production_restart PROPERTY LABELS nonparallelizable_tests) add_test(NAME ship_reqs_across_svnn_test COMMAND tests/ship_reqs_across_svnn_test.py -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_property(TEST ship_reqs_across_svnn_test PROPERTY LABELS nonparallelizable_tests) +add_test(NAME ship_restart_test COMMAND tests/ship_restart_test.py -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +set_property(TEST ship_restart_test PROPERTY LABELS nonparallelizable_tests) add_test(NAME ship_test COMMAND tests/ship_test.py -v --num-clients 10 --num-requests 5000 ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_property(TEST ship_test PROPERTY LABELS nonparallelizable_tests) add_test(NAME ship_test_unix COMMAND tests/ship_test.py -v --num-clients 10 --num-requests 5000 ${UNSHARE} --unix-socket WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) diff --git a/tests/ship_restart_test.py b/tests/ship_restart_test.py new file mode 100755 index 0000000000..47312f29d8 --- /dev/null +++ b/tests/ship_restart_test.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python3 + +import os +import tempfile +import shutil +import signal + +from TestHarness import Cluster, TestHelper, Utils, WalletMgr + +############################################################################### +# ship_restart_test +# +# This test verifies SHiP shuts down gracefully or recovers when restarting +# with various scenarios of corrupted log and/or index files. +# +############################################################################### + +Print=Utils.Print + +args = TestHelper.parse_args({"--dump-error-details","--keep-logs","-v","--leave-running","--unshared"}) + +Utils.Debug=args.v +cluster=Cluster(unshared=args.unshared, keepRunning=args.leave_running, keepLogs=args.keep_logs) +dumpErrorDetails=args.dump_error_details +walletPort=TestHelper.DEFAULT_WALLET_PORT + +totalProducerNodes=1 +totalNonProducerNodes=1 # for SHiP node +totalNodes=totalProducerNodes+totalNonProducerNodes + +walletMgr=WalletMgr(True, port=walletPort) +testSuccessful=False + +prodNodeId = 0 +shipNodeId = 1 + +origStateHistoryLog = "" +stateHistoryLog = "" +origStateHistoryIndex = "" +stateHistoryIndex = "" + +# Returns True if file1 and file2 contain the same content +def equalFiles(file1, file2): + # not the same if sizes are different + if os.path.getsize(file1) != os.path.getsize(file2): + return False + + readSize=1024 + with open(file1, 'rb') as f1, open(file2, 'rb') as f2: + while True: + bytes1 = f1.read(readSize) + bytes2 = f2.read(readSize) + + if bytes1 != bytes2: + return False + + # end of both files + if not bytes1: + return True + +# Verifies that SHiP should fail to restart with a corrupted first entry header +def corruptedHeaderTest(pos, curruptedValue, shipNode): + # restore log and index + shutil.copyfile(origStateHistoryLog, stateHistoryLog) + shutil.copyfile(origStateHistoryIndex, stateHistoryIndex) + + with open(stateHistoryLog, 'rb+') as f: # opened as binary file + f.seek(pos) # seek to the position to corrupt + f.write(curruptedValue) # corrupt it + + isRelaunchSuccess = shipNode.relaunch() + assert not isRelaunchSuccess, "SHiP node should have failed to relaunch" + +try: + TestHelper.printSystemInfo("BEGIN") + + cluster.setWalletMgr(walletMgr) + Print("Stand up cluster") + + specificExtraNodeosArgs={} + specificExtraNodeosArgs[shipNodeId]="--plugin eosio::state_history_plugin --trace-history --chain-state-history --finality-data-history --state-history-stride 200 --plugin eosio::net_api_plugin --plugin eosio::producer_api_plugin" + + if cluster.launch(topo="mesh", pnodes=totalProducerNodes, totalNodes=totalNodes, + activateIF=True, + specificExtraNodeosArgs=specificExtraNodeosArgs) is False: + Utils.cmdError("launcher") + Utils.errorExit("Failed to stand up cluster.") + + # Verify nodes are in sync and advancing + cluster.waitOnClusterSync(blockAdvancing=5) + Print("Cluster in Sync") + + Print("Shutdown unneeded bios node") + cluster.biosNode.kill(signal.SIGTERM) + + prodNode = cluster.getNode(prodNodeId) + shipNode = cluster.getNode(shipNodeId) + + Print("Shutdown producer and SHiP nodes") + prodNode.kill(signal.SIGTERM) + shipNode.kill(signal.SIGTERM) + + shipDir = os.path.join(Utils.getNodeDataDir(shipNodeId), "state-history") + stateHistoryLog = os.path.join(shipDir, "chain_state_history.log") + stateHistoryIndex = os.path.join(shipDir, "chain_state_history.index") + tmpDir = tempfile.mkdtemp() + origStateHistoryLog = os.path.join(tmpDir, "chain_state_history.log") + origStateHistoryIndex = os.path.join(tmpDir, "chain_state_history.index") + + # save original chain_state_history log and index files + Print("Save original SHiP log and index") + shutil.copyfile(stateHistoryLog, origStateHistoryLog) + shutil.copyfile(stateHistoryIndex, origStateHistoryIndex) + + ############## Part 1: restart tests while producer node is down ################# + + #-------- Index file is removed. It should be regenerated at restart. + Print("index file removed test") + + os.remove(stateHistoryIndex) + + isRelaunchSuccess = shipNode.relaunch() + assert isRelaunchSuccess, "Failed to relaunch shipNode" + assert equalFiles(stateHistoryLog, origStateHistoryLog) # log unchanged + assert equalFiles(stateHistoryIndex, origStateHistoryIndex) # index regenerated + + shipNode.kill(signal.SIGTERM) # shut down ship node for next test + + ''' + Test failure 1: index file was not regenerated. Reenable this after issue is fixed. + + #-------- Index file last entry is corrupted. It should be regenerated at restart. + with open(stateHistoryIndex, 'rb+') as stateHistoryIndexFile: # opened as binary file + # seek to last entry (8 bytes before the end of file) + stateHistoryIndexFile.seek(-8, 2) # -8 for backward, 2 for starting at end + + # set the index to a random value + stateHistoryIndexFile.write(b'\x00\x01\x02\x03\x04\x05\x06\x07') + + isRelaunchSuccess = shipNode.relaunch() + assert isRelaunchSuccess, "Failed to relaunch shipNode" + assert equalFiles(stateHistoryLog, origStateHistoryLog) + assert equalFiles(stateHistoryIndex, origStateHistoryIndex) + ''' + + #-------- Truncate index file. It should be regenerated + # because index size is not the same as expected size + Print("Truncated index file test") + + # restore log and index + shutil.copyfile(origStateHistoryLog, stateHistoryLog) + shutil.copyfile(origStateHistoryIndex, stateHistoryIndex) + + with open(stateHistoryIndex, 'rb+') as f: + indexFileSize = os.path.getsize(stateHistoryIndex) + newSize = indexFileSize - 8 + f.truncate(newSize) + + isRelaunchSuccess = shipNode.relaunch() + assert isRelaunchSuccess, "Failed to relaunch shipNode" + assert equalFiles(stateHistoryLog, origStateHistoryLog) # log file unchanged + assert equalFiles(stateHistoryIndex, origStateHistoryIndex) # index file regenerated + + shipNode.kill(signal.SIGTERM) # shut down it for next test + + #-------- Add an extra entry to index file. It should be regenerated + # because index size is not the same as expected size + Print("Extra entry index file test") + + # restore log and index + shutil.copyfile(origStateHistoryLog, stateHistoryLog) + shutil.copyfile(origStateHistoryIndex, stateHistoryIndex) + + with open(stateHistoryIndex, 'rb+') as stateHistoryIndexFile: # opened as binary file + # seek to last index, 8 bytes before the end of file + stateHistoryIndexFile.seek(0, 2) # -8 for backward, 2 for starting at end + + # write a small value + stateHistoryIndexFile.write(b'\x00\x00\x00\x00\x00\x00\x01\x0F') + + isRelaunchSuccess = shipNode.relaunch() + assert isRelaunchSuccess, "Failed to relaunch shipNode" + assert equalFiles(stateHistoryLog, origStateHistoryLog) # log file not changed + assert equalFiles(stateHistoryIndex, origStateHistoryIndex) # index file regenerated + + shipNode.kill(signal.SIGTERM) # shut down it for next test + + #-------- Remove log file. The log file should be reconstructed from state + Print("Removed log file test") + + # restore index + shutil.copyfile(origStateHistoryIndex, stateHistoryIndex) + + os.remove(stateHistoryLog) + + isRelaunchSuccess = shipNode.relaunch() + assert isRelaunchSuccess, "Failed to relaunch shipNode" + + shipNode.kill(signal.SIGTERM) # shut down it for next test + + #-------- Corrupt first entry's magic. Relaunch should fail + Print("first entry magic corruption test") + + corruptedHeaderTest(0, b'\x00\x01\x02\x03\x04\x05\x06\x07', shipNode) # 0 is magic's position + + #-------- Corrupt first entry's block_id. Relaunch should fail + Print("first entry block_id corruption test") + + corruptedHeaderTest(8, b'\x00\x01\x02\x03\x04\x05\x06\x07', shipNode) # 8 is block_id's position + + ''' + # Test failure 1: Reenable this after issue is fixed. + #-------- Corrupt first entry's payload_size. Relaunch should fail + Print("first entry payload_size corruption test") + + corruptedHeaderTest(40, b'\x00\x00\x00\x00\x00\x00\x00\x01', shipNode) # 40 is payload_size position + ''' + + ''' + # Test failure 2: Reenable this after issue is fixed. + #-------- Corrupt last entry's position . It should be repaired. + # After producer node restarts, head on SHiP node should advance. + Print("last entry postion corruption test") + + shutil.copyfile(origStateHistoryLog, stateHistoryLog) + shutil.copyfile(origStateHistoryIndex, stateHistoryIndex) + + with open(stateHistoryLog, 'rb+') as stateHistoryLogFile: # opened as binary file + # seek to last index (8 bytes before the end of file) + stateHistoryLogFile.seek(-8, 2) # -8 for backward, 2 for starting at end + + # set the index to a random value + stateHistoryLogFile.write(b'\x00\x01\x02\x03\x04\x05\x06\x07') + + isRelaunchSuccess = shipNode.relaunch() + assert isRelaunchSuccess, "Failed to relaunch shipNode" + isRelaunchSuccess = prodNode.relaunch(chainArg="--enable-stale-production") + assert isRelaunchSuccess, "Failed to relaunch prodNode" + + assert shipNode.waitForHeadToAdvance(), "Head did not advance on shipNode" + prodNode.kill(signal.SIGTERM) + shipNode.kill(signal.SIGTERM) + ''' + + ''' + # Test failure 3: Reenable this after issue is fixed. + #-------- Corrupt last entry's header. It should be repaired. + # After producer node restarts, head on SHiP node should advance. + Print("last entry header corruption test") + + shutil.copyfile(origStateHistoryLog, stateHistoryLog) + shutil.copyfile(origStateHistoryIndex, stateHistoryIndex) + + with open(stateHistoryLog, 'rb+') as f: # opened as binary file + # seek to last index (8 bytes before the end of file) + f.seek(-8, 2) # -8 for backward, 2 for starting at end + + data = f.read(8) + integer_value = int.from_bytes(data, byteorder='little') + f.seek(integer_value) + + # corrupt the header + f.write(b'\x00\x01\x02\x03\x04\x05\x06\x07') + + isRelaunchSuccess = shipNode.relaunch() + assert isRelaunchSuccess, "Failed to relaunch shipNode" + isRelaunchSuccess = prodNode.relaunch(chainArg="--enable-stale-production") + assert isRelaunchSuccess, "Failed to relaunch prodNode" + + assert shipNode.waitForHeadToAdvance(), "Head did not advance on shipNode" + prodNode.kill(signal.SIGTERM) + shipNode.kill(signal.SIGTERM) + ''' + + ############## Part 2: restart tests while producer node is up ################# + + isRelaunchSuccess = prodNode.relaunch(chainArg="--enable-stale-production") + assert isRelaunchSuccess, "Failed to relaunch prodNode" + + shutil.copyfile(origStateHistoryLog, stateHistoryLog) + shutil.copyfile(origStateHistoryIndex, stateHistoryIndex) + + #-------- Index file is removed. It should be regenerated at restart + os.remove(stateHistoryIndex) + + isRelaunchSuccess = shipNode.relaunch() + assert isRelaunchSuccess, "Failed to relaunch shipNode" + assert shipNode.waitForHeadToAdvance(), "Head did not advance on shipNode" + + shipNode.kill(signal.SIGTERM) # shut down it for next test + + ''' + #-------- Corrupt last entry of log file. It should be repaired + # and LIB should advance + with open(stateHistoryLog, 'rb+') as stateHistoryLogFile: # opened as binary file + # seek to last index, 8 bytes before the end of file + stateHistoryLogFile.seek(-8, 2) # -8 for backward, 2 for starting at end + + # set the index to a random value + stateHistoryLogFile.write(b'\x00\x01\x02\x03\x04\x05\x06\x07') + + isRelaunchSuccess = shipNode.relaunch() + assert isRelaunchSuccess, "Failed to relaunch shipNode" + assert shipNode.waitForLibToAdvance(), "LIB did not advance on shipNode" + ''' + + testSuccessful = True +finally: + TestHelper.shutdown(cluster, walletMgr, testSuccessful=testSuccessful, dumpErrorDetails=dumpErrorDetails) + if tmpDir is not None: + shutil.rmtree(tmpDir, ignore_errors=True) + +errorCode = 0 if testSuccessful else 1 +exit(errorCode) From d47c1f09f7d6654ae3d224c4c2aae1dde3ceef1d Mon Sep 17 00:00:00 2001 From: Lin Huang Date: Wed, 30 Oct 2024 09:59:31 -0400 Subject: [PATCH 02/17] Update with github issues for failed tests and minor comment changes --- tests/ship_restart_test.py | 40 ++++++++++++++------------------------ 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/tests/ship_restart_test.py b/tests/ship_restart_test.py index 47312f29d8..fbc410c50d 100755 --- a/tests/ship_restart_test.py +++ b/tests/ship_restart_test.py @@ -112,7 +112,7 @@ def corruptedHeaderTest(pos, curruptedValue, shipNode): shutil.copyfile(stateHistoryLog, origStateHistoryLog) shutil.copyfile(stateHistoryIndex, origStateHistoryIndex) - ############## Part 1: restart tests while producer node is down ################# + ############## Part 1: tests while producer node is down ################# #-------- Index file is removed. It should be regenerated at restart. Print("index file removed test") @@ -127,7 +127,7 @@ def corruptedHeaderTest(pos, curruptedValue, shipNode): shipNode.kill(signal.SIGTERM) # shut down ship node for next test ''' - Test failure 1: index file was not regenerated. Reenable this after issue is fixed. + Test failure 1: index file was not regenerated. Reenable this after https://github.com/AntelopeIO/spring/issues/990 is fixed. #-------- Index file last entry is corrupted. It should be regenerated at restart. with open(stateHistoryIndex, 'rb+') as stateHistoryIndexFile: # opened as binary file @@ -153,7 +153,7 @@ def corruptedHeaderTest(pos, curruptedValue, shipNode): with open(stateHistoryIndex, 'rb+') as f: indexFileSize = os.path.getsize(stateHistoryIndex) - newSize = indexFileSize - 8 + newSize = indexFileSize - 8 # truncate 8 bytes f.truncate(newSize) isRelaunchSuccess = shipNode.relaunch() @@ -165,18 +165,15 @@ def corruptedHeaderTest(pos, curruptedValue, shipNode): #-------- Add an extra entry to index file. It should be regenerated # because index size is not the same as expected size - Print("Extra entry index file test") + Print("Extra entry in index file test") # restore log and index shutil.copyfile(origStateHistoryLog, stateHistoryLog) shutil.copyfile(origStateHistoryIndex, stateHistoryIndex) with open(stateHistoryIndex, 'rb+') as stateHistoryIndexFile: # opened as binary file - # seek to last index, 8 bytes before the end of file - stateHistoryIndexFile.seek(0, 2) # -8 for backward, 2 for starting at end - - # write a small value - stateHistoryIndexFile.write(b'\x00\x00\x00\x00\x00\x00\x01\x0F') + stateHistoryIndexFile.seek(0, 2) # seek to end of file + stateHistoryIndexFile.write(b'\x00\x00\x00\x00\x00\x00\x01\x0F') # write a small value isRelaunchSuccess = shipNode.relaunch() assert isRelaunchSuccess, "Failed to relaunch shipNode" @@ -186,9 +183,9 @@ def corruptedHeaderTest(pos, curruptedValue, shipNode): shipNode.kill(signal.SIGTERM) # shut down it for next test #-------- Remove log file. The log file should be reconstructed from state + # and restart succeeds Print("Removed log file test") - # restore index shutil.copyfile(origStateHistoryIndex, stateHistoryIndex) os.remove(stateHistoryLog) @@ -200,24 +197,14 @@ def corruptedHeaderTest(pos, curruptedValue, shipNode): #-------- Corrupt first entry's magic. Relaunch should fail Print("first entry magic corruption test") - corruptedHeaderTest(0, b'\x00\x01\x02\x03\x04\x05\x06\x07', shipNode) # 0 is magic's position #-------- Corrupt first entry's block_id. Relaunch should fail Print("first entry block_id corruption test") - corruptedHeaderTest(8, b'\x00\x01\x02\x03\x04\x05\x06\x07', shipNode) # 8 is block_id's position ''' - # Test failure 1: Reenable this after issue is fixed. - #-------- Corrupt first entry's payload_size. Relaunch should fail - Print("first entry payload_size corruption test") - - corruptedHeaderTest(40, b'\x00\x00\x00\x00\x00\x00\x00\x01', shipNode) # 40 is payload_size position - ''' - - ''' - # Test failure 2: Reenable this after issue is fixed. + # Test failure 2: Reenable this after https://github.com/AntelopeIO/spring/issues/989 is fixed. #-------- Corrupt last entry's position . It should be repaired. # After producer node restarts, head on SHiP node should advance. Print("last entry postion corruption test") @@ -243,7 +230,7 @@ def corruptedHeaderTest(pos, curruptedValue, shipNode): ''' ''' - # Test failure 3: Reenable this after issue is fixed. + # Test failure 3: Reenable this after https://github.com/AntelopeIO/spring/issues/989 is fixed. #-------- Corrupt last entry's header. It should be repaired. # After producer node restarts, head on SHiP node should advance. Print("last entry header corruption test") @@ -272,7 +259,7 @@ def corruptedHeaderTest(pos, curruptedValue, shipNode): shipNode.kill(signal.SIGTERM) ''' - ############## Part 2: restart tests while producer node is up ################# + ############## Part 2: tests while producer node is up ################# isRelaunchSuccess = prodNode.relaunch(chainArg="--enable-stale-production") assert isRelaunchSuccess, "Failed to relaunch prodNode" @@ -281,6 +268,8 @@ def corruptedHeaderTest(pos, curruptedValue, shipNode): shutil.copyfile(origStateHistoryIndex, stateHistoryIndex) #-------- Index file is removed. It should be regenerated at restart + Print("Index file removed while producer node is up test") + os.remove(stateHistoryIndex) isRelaunchSuccess = shipNode.relaunch() @@ -290,8 +279,9 @@ def corruptedHeaderTest(pos, curruptedValue, shipNode): shipNode.kill(signal.SIGTERM) # shut down it for next test ''' + # Test failure 4: Reenable this after issue https://github.com/AntelopeIO/spring/issues/989 fixed. #-------- Corrupt last entry of log file. It should be repaired - # and LIB should advance + # and head should advance with open(stateHistoryLog, 'rb+') as stateHistoryLogFile: # opened as binary file # seek to last index, 8 bytes before the end of file stateHistoryLogFile.seek(-8, 2) # -8 for backward, 2 for starting at end @@ -301,7 +291,7 @@ def corruptedHeaderTest(pos, curruptedValue, shipNode): isRelaunchSuccess = shipNode.relaunch() assert isRelaunchSuccess, "Failed to relaunch shipNode" - assert shipNode.waitForLibToAdvance(), "LIB did not advance on shipNode" + assert shipNode.waitForHeadToAdvance(), "Head did not advance on shipNode" ''' testSuccessful = True From 73120f5bdf822b7dc1205a47b4be59592ac3db39 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Wed, 30 Oct 2024 13:52:22 -0500 Subject: [PATCH 03/17] GH-985 Add ability to interrupt transaction in apply_block --- libraries/chain/controller.cpp | 37 +++++++++++++++++-- .../chain/include/eosio/chain/controller.hpp | 3 ++ .../chain/include/eosio/chain/exceptions.hpp | 8 ++-- .../include/eosio/chain/platform_timer.hpp | 1 + .../chain/platform_timer_asio_fallback.cpp | 5 +++ libraries/chain/platform_timer_kqueue.cpp | 5 +++ libraries/chain/platform_timer_posix.cpp | 5 +++ libraries/chain/transaction_context.cpp | 5 ++- programs/nodeos/main.cpp | 5 +++ 9 files changed, 66 insertions(+), 8 deletions(-) diff --git a/libraries/chain/controller.cpp b/libraries/chain/controller.cpp index fd953c585b..8fa55cadf0 100644 --- a/libraries/chain/controller.cpp +++ b/libraries/chain/controller.cpp @@ -1006,6 +1006,8 @@ struct controller_impl { async_t async_aggregation = async_t::yes; // by default we process incoming votes asynchronously my_finalizers_t my_finalizers; std::atomic writing_snapshot = false; + std::atomic applying_block = false; + platform_timer& main_thread_timer; thread_local static platform_timer timer; // a copy for main thread and each read-only thread #if defined(EOSIO_EOS_VM_RUNTIME_ENABLED) || defined(EOSIO_EOS_VM_JIT_RUNTIME_ENABLED) @@ -1285,6 +1287,7 @@ struct controller_impl { read_mode( cfg.read_mode ), thread_pool(), my_finalizers(cfg.finalizers_dir / config::safety_filename), + main_thread_timer(timer), // assumes constructor is called from main thread wasmif( conf.wasm_runtime, conf.eosvmoc_tierup, db, conf.state_dir, conf.eosvmoc_config, !conf.profile_accounts.empty() ) { assert(cfg.chain_thread_pool_size > 0); @@ -3780,6 +3783,9 @@ struct controller_impl { } } + applying_block = true; + auto apply = fc::make_scoped_exit([&](){ applying_block = false; }); + transaction_trace_ptr trace; size_t packed_idx = 0; @@ -3806,7 +3812,11 @@ struct controller_impl { std::holds_alternative(receipt.trx); if( transaction_failed && !transaction_can_fail) { - edump((*trace)); + if (trace->except->code() == interrupt_exception::code_value) { + ilog("Interrupt of trx: ${t}", ("t", *trace)); + } else { + edump((*trace)); + } throw *trace->except; } @@ -3875,7 +3885,8 @@ struct controller_impl { } catch ( const boost::interprocess::bad_alloc& ) { throw; } catch ( const fc::exception& e ) { - edump((e.to_detail_string())); + if (e.code() != interrupt_exception::code_value) + edump((e.to_detail_string())); abort_block(); throw; } catch ( const std::exception& e ) { @@ -4431,8 +4442,12 @@ struct controller_impl { } catch ( const boost::interprocess::bad_alloc& ) { throw; } catch (const fc::exception& e) { - elog("exception thrown while applying block ${bn} : ${id}, previous ${p}, error: ${e}", - ("bn", bsp->block_num())("id", bsp->id())("p", bsp->previous())("e", e.to_detail_string())); + if (e.code() == interrupt_exception::code_value) { + ilog("interrupt while applying block ${bn} : ${id}", ("bn", bsp->block_num())("id", bsp->id())); + } else { + elog("exception thrown while applying block ${bn} : ${id}, previous ${p}, error: ${e}", + ("bn", bsp->block_num())("id", bsp->id())("p", bsp->previous())("e", e.to_detail_string())); + } except = std::current_exception(); } catch (const std::exception& e) { elog("exception thrown while applying block ${bn} : ${id}, previous ${p}, error: ${e}", @@ -4495,6 +4510,16 @@ struct controller_impl { return applied_trxs; } + void interrupt_transaction() { + // Only interrupt transaction if applying a block. Speculative trxs already have a deadline set so they + // have limited run time already. This is to allow killing a long-running transaction in a block being + // validated. + if (applying_block) { + ilog("Interrupting apply block"); + main_thread_timer.expire_now(); + } + } + // @param if_active true if instant finality is active static checksum256_type calc_merkle( deque&& digests, bool if_active ) { if (if_active) { @@ -5255,6 +5280,10 @@ deque controller::abort_block() { return my->abort_block(); } +void controller::interrupt_transaction() { + my->interrupt_transaction(); +} + boost::asio::io_context& controller::get_thread_pool() { return my->thread_pool.get_executor(); } diff --git a/libraries/chain/include/eosio/chain/controller.hpp b/libraries/chain/include/eosio/chain/controller.hpp index cf8f12e7e4..fe58f39f95 100644 --- a/libraries/chain/include/eosio/chain/controller.hpp +++ b/libraries/chain/include/eosio/chain/controller.hpp @@ -206,6 +206,9 @@ namespace eosio::chain { */ deque abort_block(); + /// Expected to be called from signal handler + void interrupt_transaction(); + /** * */ diff --git a/libraries/chain/include/eosio/chain/exceptions.hpp b/libraries/chain/include/eosio/chain/exceptions.hpp index 1539b7c3be..fc6ef200b3 100644 --- a/libraries/chain/include/eosio/chain/exceptions.hpp +++ b/libraries/chain/include/eosio/chain/exceptions.hpp @@ -381,6 +381,9 @@ namespace eosio { namespace chain { 3080005, "Transaction CPU usage is too much for the remaining allowable usage of the current block" ) FC_DECLARE_DERIVED_EXCEPTION( deadline_exception, resource_exhausted_exception, 3080006, "Transaction took too long" ) + FC_DECLARE_DERIVED_EXCEPTION( leeway_deadline_exception, deadline_exception, + 3081001, "Transaction reached the deadline set due to leeway on account CPU limits" ) + FC_DECLARE_DERIVED_EXCEPTION( greylist_net_usage_exceeded, resource_exhausted_exception, 3080007, "Transaction exceeded the current greylisted account network usage limit" ) FC_DECLARE_DERIVED_EXCEPTION( greylist_cpu_usage_exceeded, resource_exhausted_exception, @@ -389,9 +392,8 @@ namespace eosio { namespace chain { 3080009, "Read-only transaction eos-vm-oc compile temporary failure" ) FC_DECLARE_DERIVED_EXCEPTION( ro_trx_vm_oc_compile_permanent_failure, resource_exhausted_exception, 3080010, "Read-only transaction eos-vm-oc compile permanent failure" ) - - FC_DECLARE_DERIVED_EXCEPTION( leeway_deadline_exception, deadline_exception, - 3081001, "Transaction reached the deadline set due to leeway on account CPU limits" ) + FC_DECLARE_DERIVED_EXCEPTION( interrupt_exception, resource_exhausted_exception, + 3080011, "Transaction interrupted by signal" ) FC_DECLARE_DERIVED_EXCEPTION( authorization_exception, chain_exception, 3090000, "Authorization exception" ) diff --git a/libraries/chain/include/eosio/chain/platform_timer.hpp b/libraries/chain/include/eosio/chain/platform_timer.hpp index 29a8d62d46..72fb8d1fef 100644 --- a/libraries/chain/include/eosio/chain/platform_timer.hpp +++ b/libraries/chain/include/eosio/chain/platform_timer.hpp @@ -17,6 +17,7 @@ struct platform_timer { void start(fc::time_point tp); void stop(); + void expire_now(); /* Sets a callback for when timer expires. Be aware this could might fire from a signal handling context and/or on any particular thread. Only a single callback can be registered at once; trying to register more will diff --git a/libraries/chain/platform_timer_asio_fallback.cpp b/libraries/chain/platform_timer_asio_fallback.cpp index 28525c7968..7ec808e128 100644 --- a/libraries/chain/platform_timer_asio_fallback.cpp +++ b/libraries/chain/platform_timer_asio_fallback.cpp @@ -84,6 +84,11 @@ void platform_timer::start(fc::time_point tp) { } } +void platform_timer::expire_now() { + expired = 1; + call_expiration_callback(); +} + void platform_timer::stop() { if(expired) return; diff --git a/libraries/chain/platform_timer_kqueue.cpp b/libraries/chain/platform_timer_kqueue.cpp index 3cb341a031..3e9dc68104 100644 --- a/libraries/chain/platform_timer_kqueue.cpp +++ b/libraries/chain/platform_timer_kqueue.cpp @@ -106,6 +106,11 @@ void platform_timer::start(fc::time_point tp) { } } +void platform_timer::expire_now() { + expired = 1; + call_expiration_callback(); +} + void platform_timer::stop() { if(expired) return; diff --git a/libraries/chain/platform_timer_posix.cpp b/libraries/chain/platform_timer_posix.cpp index bb000de5c3..0a50ebcade 100644 --- a/libraries/chain/platform_timer_posix.cpp +++ b/libraries/chain/platform_timer_posix.cpp @@ -71,6 +71,11 @@ void platform_timer::start(fc::time_point tp) { } } +void platform_timer::expire_now() { + expired = 1; + call_expiration_callback(); +} + void platform_timer::stop() { if(expired) return; diff --git a/libraries/chain/transaction_context.cpp b/libraries/chain/transaction_context.cpp index c59a9f2f8b..279b64a982 100644 --- a/libraries/chain/transaction_context.cpp +++ b/libraries/chain/transaction_context.cpp @@ -459,7 +459,10 @@ namespace eosio::chain { return; auto now = fc::time_point::now(); - if( explicit_billed_cpu_time || deadline_exception_code == deadline_exception::code_value ) { + if (explicit_billed_cpu_time) { + EOS_THROW( interrupt_exception, "interrupt signaled, ran ${bt}us, start ${s}", + ("bt", now - pseudo_start)("s", start) ); + } else if( deadline_exception_code == deadline_exception::code_value ) { EOS_THROW( deadline_exception, "deadline exceeded ${billing_timer}us", ("billing_timer", now - pseudo_start)("now", now)("deadline", _deadline)("start", start) ); } else if( deadline_exception_code == block_cpu_usage_exceeded::code_value ) { diff --git a/programs/nodeos/main.cpp b/programs/nodeos/main.cpp index a73e7cc582..a3533da6ec 100644 --- a/programs/nodeos/main.cpp +++ b/programs/nodeos/main.cpp @@ -171,6 +171,8 @@ int main(int argc, char** argv) app->set_stop_executor_cb([&app]() { ilog("appbase quit called"); app->get_io_context().stop(); + auto& chain = app->get_plugin().chain(); + chain.interrupt_transaction(); }); app->set_version(htonl(short_hash)); app->set_version_string(eosio::version::version_client()); @@ -220,6 +222,9 @@ int main(int argc, char** argv) elog( "database dirty flag set (likely due to unclean shutdown): replay required" ); return DATABASE_DIRTY; } + } else if (e.code() == interrupt_exception::code_value) { + ilog("Interrupted, successfully exiting"); + return SUCCESS; } elog( "${e}", ("e", e.to_detail_string())); return OTHER_FAIL; From 2ff14e76cec8a8c5717dd0d1966a9a3648749aa0 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Wed, 30 Oct 2024 13:53:39 -0500 Subject: [PATCH 04/17] GH-985 Add unittest for interrupt_transaction --- tests/chain_test_utils.hpp | 35 ++++++++++++++++++++----- unittests/checktime_tests.cpp | 49 +++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 6 deletions(-) diff --git a/tests/chain_test_utils.hpp b/tests/chain_test_utils.hpp index 67f4e5f30c..2e4de93272 100644 --- a/tests/chain_test_utils.hpp +++ b/tests/chain_test_utils.hpp @@ -49,6 +49,17 @@ struct reqactivated { } }; +inline private_key_type get_private_key( name keyname, string role ) { + if (keyname == config::system_account_name) + return private_key_type::regenerate(fc::sha256::hash(std::string("nathan"))); + + return private_key_type::regenerate(fc::sha256::hash(keyname.to_string()+role)); +} + +inline public_key_type get_public_key( name keyname, string role ){ + return get_private_key( keyname, role ).get_public_key(); +} + // Create a read-only trx that works with bios reqactivated action inline auto make_bios_ro_trx(eosio::chain::controller& control) { const auto& pfm = control.get_protocol_feature_manager(); @@ -66,12 +77,7 @@ inline auto make_bios_ro_trx(eosio::chain::controller& control) { inline auto push_input_trx(appbase::scoped_app& app, eosio::chain::controller& control, account_name account, signed_transaction& trx) { trx.expiration = fc::time_point_sec{fc::time_point::now() + fc::seconds(30)}; trx.set_reference_block( control.head().id() ); - if (account == config::system_account_name) { - auto default_priv_key = private_key_type::regenerate(fc::sha256::hash(std::string("nathan"))); - trx.sign(default_priv_key, control.get_chain_id()); - } else { - trx.sign(testing::tester::get_private_key(account, "active"), control.get_chain_id()); - } + trx.sign(get_private_key(account, "active"), control.get_chain_id()); auto ptrx = std::make_shared( trx, packed_transaction::compression_type::zlib ); auto trx_promise = std::make_shared>(); @@ -120,6 +126,23 @@ inline auto set_code(appbase::scoped_app& app, eosio::chain::controller& control return push_input_trx(app, control, account, trx); } +inline transaction_trace_ptr create_account(appbase::scoped_app& app, eosio::chain::controller& control, account_name a, account_name creator) { + signed_transaction trx; + + authority owner_auth{ get_public_key( a, "owner" ) }; + authority active_auth{ get_public_key( a, "active" ) }; + + trx.actions.emplace_back( vector{{creator,config::active_name}}, + chain::newaccount{ + .creator = creator, + .name = a, + .owner = owner_auth, + .active = active_auth, + }); + + return push_input_trx(app, control, creator, trx); +} + inline void activate_protocol_features_set_bios_contract(appbase::scoped_app& app, chain_plugin* chain_plug) { using namespace appbase; diff --git a/unittests/checktime_tests.cpp b/unittests/checktime_tests.cpp index 37da24b8d7..dae5521b77 100644 --- a/unittests/checktime_tests.cpp +++ b/unittests/checktime_tests.cpp @@ -122,6 +122,55 @@ BOOST_AUTO_TEST_CASE_TEMPLATE( checktime_fail_tests, T, validating_testers ) { t BOOST_REQUIRE_EQUAL( t.validate(), true ); } FC_LOG_AND_RETHROW() } +BOOST_AUTO_TEST_CASE( checktime_interrupt_test) { try { + savanna_tester t; + savanna_tester other; + auto block = t.produce_block(); + other.push_block(block); + t.create_account( "testapi"_n ); + t.set_code( "testapi"_n, test_contracts::test_api_wasm() ); + block = t.produce_block(); + other.push_block(block); + + auto [trace, b] = CALL_TEST_FUNCTION_WITH_BLOCK( t, "test_checktime", "checktime_pass", {}); + BOOST_REQUIRE_EQUAL( b->transactions.size(), 1 ); + + // Make a copy of the valid block and swicth the checktime_pass transaction with checktime_failure + auto copy_b = std::make_shared(b->clone()); + auto signed_tx = std::get(copy_b->transactions.back().trx).get_signed_transaction(); + auto& act = signed_tx.actions.back(); + constexpr chain::name checktime_fail_n{WASM_TEST_ACTION("test_checktime", "checktime_failure")}; + act.name = checktime_fail_n; + act.data = fc::raw::pack(10000000000000000000ULL); + // Re-sign the transaction + signed_tx.signatures.clear(); + signed_tx.sign(t.get_private_key("testapi"_n, "active"), t.get_chain_id()); + // Replace the transaction + auto new_packed_tx = packed_transaction(signed_tx); + copy_b->transactions.back().trx = std::move(new_packed_tx); + + // Re-calculate the transaction merkle + deque trx_digests; + const auto& trxs = copy_b->transactions; + for( const auto& a : trxs ) + trx_digests.emplace_back( a.digest() ); + copy_b->transaction_mroot = calculate_merkle( std::move(trx_digests) ); + // Re-sign the block + copy_b->producer_signature = t.get_private_key(config::system_account_name, "active").sign(copy_b->calculate_id()); + + std::thread th( [&c=*other.control]() { + std::this_thread::sleep_for( std::chrono::milliseconds(50) ); + c.interrupt_transaction(); + } ); + + // apply block, caught in an "infinite" loop + BOOST_CHECK_EXCEPTION( other.push_block(copy_b), fc::exception, + [](const fc::exception& e) { return e.code() == interrupt_exception::code_value; } ); + + th.join(); + +} FC_LOG_AND_RETHROW() } + BOOST_AUTO_TEST_CASE_TEMPLATE( checktime_pause_max_trx_cpu_extended_test, T, testers ) { try { fc::temp_directory tempdir; auto conf_genesis = tester::default_config( tempdir ); From a21cf18a6e43e564ed4c8f8a4af26078f0582aa1 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Wed, 30 Oct 2024 13:54:34 -0500 Subject: [PATCH 05/17] GH-985 Add integration test for interrupt_transaction --- .../test_control_api_plugin.cpp | 4 + .../test_control_plugin.hpp | 13 ++- .../test_control_plugin.cpp | 91 ++++++++++++++-- tests/CMakeLists.txt | 3 + tests/interrupt_trx_test.py | 99 ++++++++++++++++++ .../payloadless/payloadless.abi | 10 ++ .../payloadless/payloadless.cpp | 14 +++ .../payloadless/payloadless.hpp | 3 + .../payloadless/payloadless.wasm | Bin 2918 -> 3192 bytes 9 files changed, 230 insertions(+), 7 deletions(-) create mode 100755 tests/interrupt_trx_test.py mode change 100644 => 100755 unittests/test-contracts/payloadless/payloadless.wasm diff --git a/plugins/test_control_api_plugin/test_control_api_plugin.cpp b/plugins/test_control_api_plugin/test_control_api_plugin.cpp index e2b5a6892a..ce13bd6408 100644 --- a/plugins/test_control_api_plugin/test_control_api_plugin.cpp +++ b/plugins/test_control_api_plugin/test_control_api_plugin.cpp @@ -51,6 +51,10 @@ void test_control_api_plugin::plugin_startup() { TEST_CONTROL_RW_CALL(throw_on, 202, http_params_types::params_required) }, appbase::exec_queue::read_write); + app().get_plugin().add_api({ + TEST_CONTROL_RW_CALL(swap_action, 202, http_params_types::params_required) + }, appbase::exec_queue::read_write); + } void test_control_api_plugin::plugin_shutdown() {} diff --git a/plugins/test_control_plugin/include/eosio/test_control_plugin/test_control_plugin.hpp b/plugins/test_control_plugin/include/eosio/test_control_plugin/test_control_plugin.hpp index 970e711673..3f772f4bc2 100644 --- a/plugins/test_control_plugin/include/eosio/test_control_plugin/test_control_plugin.hpp +++ b/plugins/test_control_plugin/include/eosio/test_control_plugin/test_control_plugin.hpp @@ -33,7 +33,17 @@ class read_write { }; empty throw_on(const throw_on_params& params) const; - private: + // produce a next block with `from` action replaced with `to` action + // requires Savanna to be active, this assumes blocks are is_proper_svnn_block + struct swap_action_params { + chain::name from; // replace from action in block to `to` action + chain::name to; + fc::crypto::private_key trx_priv_key; + fc::crypto::private_key blk_priv_key; + }; + empty swap_action(const swap_action_params& params) const; + +private: test_control_ptr my; }; @@ -68,3 +78,4 @@ class test_control_plugin : public plugin { FC_REFLECT(eosio::test_control_apis::empty, ) FC_REFLECT(eosio::test_control_apis::read_write::kill_node_on_producer_params, (producer)(where_in_sequence)(based_on_lib) ) FC_REFLECT(eosio::test_control_apis::read_write::throw_on_params, (signal)(exception) ) +FC_REFLECT(eosio::test_control_apis::read_write::swap_action_params, (from)(to)(trx_priv_key)(blk_priv_key) ) diff --git a/plugins/test_control_plugin/test_control_plugin.cpp b/plugins/test_control_plugin/test_control_plugin.cpp index 8402623e30..8644e56fa9 100644 --- a/plugins/test_control_plugin/test_control_plugin.cpp +++ b/plugins/test_control_plugin/test_control_plugin.cpp @@ -12,10 +12,11 @@ class test_control_plugin_impl { void kill_on_head(account_name prod, uint32_t where_in_seq); void set_throw_on_options(const test_control_apis::read_write::throw_on_params& throw_options); + void set_swap_action_options(const test_control_apis::read_write::swap_action_params& swap_options); private: void block_start(chain::block_num_type block_num); void accepted_block_header(const chain::block_id_type& id); - void accepted_block(const chain::block_id_type& id); + void accepted_block(const chain::block_id_type& id, const chain::signed_block_ptr& block); void irreversible_block(const chain::block_id_type& id); void applied_transaction(); void voted_block(); @@ -25,6 +26,9 @@ class test_control_plugin_impl { void reset_throw(); void process_next_block_state(const chain::block_id_type& id); + void swap_action_in_block(const chain::signed_block_ptr& b); + void reset_swap_action() { _swap_on_options = {}; } + chain::controller& _chain; struct kill_options { account_name _producer; @@ -35,7 +39,8 @@ class test_control_plugin_impl { bool _track_head{false}; } _kill_options; - test_control_apis::read_write::throw_on_params _throw_options; + test_control_apis::read_write::throw_on_params _throw_options; + test_control_apis::read_write::swap_action_params _swap_on_options; std::optional _block_start_connection; std::optional _accepted_block_header_connection; @@ -59,7 +64,7 @@ void test_control_plugin_impl::connect() { _accepted_block_connection = _chain.accepted_block().connect( [&]( const chain::block_signal_params& t ) { const auto& [ block, id ] = t; - accepted_block( id ); + accepted_block( id, block ); } ); _irreversible_block_connection.emplace( _chain.irreversible_block().connect( [&]( const chain::block_signal_params& t ) { @@ -96,6 +101,71 @@ void test_control_plugin_impl::reset_throw() { _throw_options = test_control_apis::read_write::throw_on_params{}; } +void test_control_plugin_impl::swap_action_in_block(const chain::signed_block_ptr& b) { + if (b->transactions.empty()) + return; + + bool found = std::find_if(b->transactions.cbegin(), b->transactions.cend(), [&](const auto& t) { + return std::visit(chain::overloaded{ + [](const transaction_id_type&) { return false; }, + [&](const chain::packed_transaction& pt) { + for (const auto& a : pt.get_transaction().actions) { + if (a.name == _swap_on_options.from) + return true; + } + return false; + } + }, t.trx); + }) != b->transactions.cend(); + if (!found) + return; + + if (!b->is_proper_svnn_block()) { + elog("Block is not a Savanna block, swap_action failed."); + return; + } + + auto copy_b = std::make_shared(b->clone()); + copy_b->previous = b->calculate_id(); + copy_b->block_extensions.clear(); // remove QC extension since header will claim same as previous block + copy_b->timestamp = b->timestamp.next(); + // swap out action + for (auto& t : copy_b->transactions) { + std::visit(chain::overloaded{ + [](const transaction_id_type&) {}, + [&](chain::packed_transaction& pt) { + for (auto& a : pt.get_transaction().actions) { + if (a.name == _swap_on_options.from) { + auto signed_tx = pt.get_signed_transaction(); + auto& act = signed_tx.actions.back(); + act.name = _swap_on_options.to; + // Re-sign the transaction + signed_tx.signatures.clear(); + signed_tx.sign(_swap_on_options.trx_priv_key, _chain.get_chain_id()); + // Replace the transaction + auto new_packed_tx = packed_transaction(signed_tx); + const_cast(pt) = std::move(new_packed_tx); + } + } + } + }, t.trx); + } + // Re-calculate the transaction merkle + std::deque trx_digests; + const auto& trxs = copy_b->transactions; + for( const auto& tr : trxs ) + trx_digests.emplace_back( tr.digest() ); + copy_b->transaction_mroot = chain::calculate_merkle( std::move(trx_digests) ); + // Re-sign the block + copy_b->producer_signature = _swap_on_options.blk_priv_key.sign(copy_b->calculate_id()); + + // will be processed on the next start_block if is_new_best_head + const auto&[is_new_best_head, bh] = _chain.accept_block(copy_b->calculate_id(), copy_b); + ilog("Swapped action ${f} to ${t}, is_new_best_head ${bh}, block ${bn}", + ("f", _swap_on_options.from)("t", _swap_on_options.to)("bh", is_new_best_head)("bn", bh ? bh->block_num() : 0)); + reset_swap_action(); +} + void test_control_plugin_impl::block_start(chain::block_num_type block_num) { if (_throw_options.signal == "block_start") throw_exception(); @@ -106,11 +176,13 @@ void test_control_plugin_impl::accepted_block_header(const chain::block_id_type& throw_exception(); } -void test_control_plugin_impl::accepted_block(const chain::block_id_type& id) { +void test_control_plugin_impl::accepted_block(const chain::block_id_type& id, const chain::signed_block_ptr& block) { if (_kill_options._track_head) process_next_block_state(id); if (_throw_options.signal == "accepted_block") throw_exception(); + if (!_swap_on_options.from.empty()) + swap_action_in_block(block); } void test_control_plugin_impl::irreversible_block(const chain::block_id_type& id) { @@ -185,12 +257,13 @@ void test_control_plugin_impl::kill_on_head(account_name prod, uint32_t where_in _kill_options._track_head = true; } -// ----------------- throw_on -------------------------------- - void test_control_plugin_impl::set_throw_on_options(const test_control_apis::read_write::throw_on_params& throw_options) { _throw_options = throw_options; } +void test_control_plugin_impl::set_swap_action_options(const test_control_apis::read_write::swap_action_params& swap_options) { + _swap_on_options = swap_options; +} test_control_plugin::test_control_plugin() = default; @@ -230,5 +303,11 @@ empty read_write::throw_on(const read_write::throw_on_params& params) const { return {}; } +empty read_write::swap_action(const read_write::swap_action_params& params) const { + ilog("received swap_action: ${p}", ("p", params)); + my->set_swap_action_options(params); + return {}; +} + } // namespace test_control_apis } // namespace eosio diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2f4dd643ed..b50e052d29 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -89,6 +89,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/nodeos_contrl_c_test.py ${CMAKE_CURRE configure_file(${CMAKE_CURRENT_SOURCE_DIR}/read_only_trx_test.py ${CMAKE_CURRENT_BINARY_DIR}/read_only_trx_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/resource_monitor_plugin_test.py ${CMAKE_CURRENT_BINARY_DIR}/resource_monitor_plugin_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/light_validation_sync_test.py ${CMAKE_CURRENT_BINARY_DIR}/light_validation_sync_test.py COPYONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/interrupt_trx_test.py ${CMAKE_CURRENT_BINARY_DIR}/interrupt_trx_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/trace_plugin_test.py ${CMAKE_CURRENT_BINARY_DIR}/trace_plugin_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/nested_container_multi_index_test.py ${CMAKE_CURRENT_BINARY_DIR}/nested_container_multi_index_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/auto_bp_peering_test.py ${CMAKE_CURRENT_BINARY_DIR}/auto_bp_peering_test.py COPYONLY) @@ -416,6 +417,8 @@ add_test(NAME light_validation_sync_test COMMAND tests/light_validation_sync_tes set_property(TEST light_validation_sync_test PROPERTY LABELS nonparallelizable_tests) add_test(NAME light_validation_sync_if_test COMMAND tests/light_validation_sync_test.py --activate-if -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_property(TEST light_validation_sync_if_test PROPERTY LABELS nonparallelizable_tests) +add_test(NAME interrupt_trx_test COMMAND tests/interrupt_trx_test.py -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +set_property(TEST interrupt_trx_test PROPERTY LABELS nonparallelizable_tests) add_test(NAME auto_bp_peering_test COMMAND tests/auto_bp_peering_test.py -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_property(TEST auto_bp_peering_test PROPERTY LABELS long_running_tests) diff --git a/tests/interrupt_trx_test.py b/tests/interrupt_trx_test.py new file mode 100755 index 0000000000..ceddfddfd1 --- /dev/null +++ b/tests/interrupt_trx_test.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 + +import json +import signal + +from TestHarness import Account, Cluster, Node, ReturnType, TestHelper, Utils, WalletMgr +from TestHarness.TestHelper import AppArgs + +############################################################### +# interrupt_trx_test +# +# Test applying a block with an infinite trx and verify SIGTERM kill +# interrupts the transaction and aborts the block. +# +############################################################### + +# Parse command line arguments +args = TestHelper.parse_args({"-v","--dump-error-details","--leave-running","--keep-logs","--unshared"}) +Utils.Debug = args.v +dumpErrorDetails=args.dump_error_details +dontKill=args.leave_running +keepLogs=args.keep_logs + +EOSIO_ACCT_PRIVATE_DEFAULT_KEY = "5KQwrPbwdL6PhXujxW37FSSQZ1JiwsST4cqQzDeyXtP79zkvFD3" +EOSIO_ACCT_PUBLIC_DEFAULT_KEY = "EOS6MRyAjQq8ud7hVNYcfnVPJqcVpscN5So8BhtHuGYqET5GDW5CV" + +walletMgr=WalletMgr(True) +cluster=Cluster(unshared=args.unshared, keepRunning=args.leave_running, keepLogs=args.keep_logs) +cluster.setWalletMgr(walletMgr) + +testSuccessful = False +try: + TestHelper.printSystemInfo("BEGIN") + assert cluster.launch( + pnodes=1, + prodCount=1, + totalProducers=1, + totalNodes=2, + loadSystemContract=False, + activateIF=True, + extraNodeosArgs="--plugin eosio::test_control_api_plugin") + + prodNode = cluster.getNode(0) + validationNode = cluster.getNode(1) + + # Create a transaction to create an account + Utils.Print("create a new account payloadless from the producer node") + payloadlessAcc = Account("payloadless") + payloadlessAcc.ownerPublicKey = EOSIO_ACCT_PUBLIC_DEFAULT_KEY + payloadlessAcc.activePublicKey = EOSIO_ACCT_PUBLIC_DEFAULT_KEY + prodNode.createAccount(payloadlessAcc, cluster.eosioAccount) + + contractDir="unittests/test-contracts/payloadless" + wasmFile="payloadless.wasm" + abiFile="payloadless.abi" + Utils.Print("Publish payloadless contract") + trans = prodNode.publishContract(payloadlessAcc, contractDir, wasmFile, abiFile, waitForTransBlock=True) + + contract="payloadless" + action="doit" + data="{}" + opts="--permission payloadless@active" + trans=prodNode.pushMessage(contract, action, data, opts) + assert trans and trans[0], "Failed to push doit action" + + action="doitslow" + trans=prodNode.pushMessage(contract, action, data, opts) + assert trans and trans[0], "Failed to push doitslow action" + + action="doitforever" + trans=prodNode.pushMessage(contract, action, data, opts, silentErrors=True) + assert trans and not trans[0], "push doitforever action did not fail as expected" + + prodNode.processUrllibRequest("test_control", "swap_action", {"from": "doitslow", "to": "doitforever"}) + + action="doitslow" + trans=prodNode.pushMessage(contract, action, data, opts) + assert trans and trans[0], "Failed to push doitslow action" + + prodNode.waitForProducer("defproducera") + + # Needs https://github.com/AntelopeIO/spring/issues/876 + # prodNode.processUrllibRequest("test_control", "swap_action", + # {"from":"doitslow", "to":"doitforever", + # "trx_priv_key":EOSIO_ACCT_PRIVATE_DEFAULT_KEY, + # "blk_priv_key":cluster.defproduceraAccount.activePrivateKey}) + # + # assert not prodNode.waitForHeadToAdvance(3), f"prodNode did advance head after doitforever action" + # + # prodNode.popenProc.send_signal(signal.SIGTERM) + # + # assert not prodNode.verifyAlive(), "prodNode did not exit from SIGTERM" + + testSuccessful = True +finally: + TestHelper.shutdown(cluster, walletMgr, testSuccessful, dumpErrorDetails) + +exitCode = 0 if testSuccessful else 1 +exit(exitCode) diff --git a/unittests/test-contracts/payloadless/payloadless.abi b/unittests/test-contracts/payloadless/payloadless.abi index aafa35c171..f78c5c42f0 100644 --- a/unittests/test-contracts/payloadless/payloadless.abi +++ b/unittests/test-contracts/payloadless/payloadless.abi @@ -8,6 +8,11 @@ "base": "", "fields": [] }, + { + "name": "doitforever", + "base": "", + "fields": [] + }, { "name": "doitslow", "base": "", @@ -20,6 +25,11 @@ "type": "doit", "ricardian_contract": "" }, + { + "name": "doitforever", + "type": "doitforever", + "ricardian_contract": "" + }, { "name": "doitslow", "type": "doitslow", diff --git a/unittests/test-contracts/payloadless/payloadless.cpp b/unittests/test-contracts/payloadless/payloadless.cpp index e9556ca5ed..d7dcc99d4d 100644 --- a/unittests/test-contracts/payloadless/payloadless.cpp +++ b/unittests/test-contracts/payloadless/payloadless.cpp @@ -50,3 +50,17 @@ void payloadless::doitslow() { } } +void payloadless::doitforever() { + print("Im a payloadless forever action"); + constexpr size_t max_cpu_prime = std::numeric_limits::max(); + + while (true) { + for (size_t p = 2; p <= max_cpu_prime; p += 1) { + if (is_prime(p) && is_mersenne_prime(p)) { + // We need to keep an eye on this to make sure it doesn't get optimized out. So far so good. + //eosio::print_f(" %u", p); + } + } + } +} + diff --git a/unittests/test-contracts/payloadless/payloadless.hpp b/unittests/test-contracts/payloadless/payloadless.hpp index 0fea87a29b..10f696fded 100644 --- a/unittests/test-contracts/payloadless/payloadless.hpp +++ b/unittests/test-contracts/payloadless/payloadless.hpp @@ -11,4 +11,7 @@ class [[eosio::contract]] payloadless : public eosio::contract { [[eosio::action]] void doitslow(); + + [[eosio::action]] + void doitforever(); }; diff --git a/unittests/test-contracts/payloadless/payloadless.wasm b/unittests/test-contracts/payloadless/payloadless.wasm old mode 100644 new mode 100755 index 53cc07f5603f905f772a3a84bbac90ce58f901b1..6c8396afb64ee4691b9380a46ca213cc3ad5f5ac GIT binary patch delta 408 zcmX|)F-rqM5QS%E_dLzsnImclsdf`B>@6&BB@qLH*xMZO1Y_czQN)0F`2$Meu<$2n zA%cjY7V!sYX=P!fg5Xa39IGKk@!NJRl%k&(laUy-jd|}dB=)QiwT;9E( zJb|QD?n{v9O{MQxzP9PcPI|~F!9z;)S06mkq>%aKOVqUJ8n4D3MFUzgL{%4PsNQ;R ziJetoIB&Xc1j#NGcTniAnFGazWE^l0sDKlA2XB#@+*(%XA4B2BvVGKEwdWz|H rW)Z9DyqKA-wPYaUpwotHcxF5DCp!lx7dtlt4-X3~V*w*0BP%l_10$Olb3LPD!vO~FdIrb02#Q^Z zi7huZH@~QoiGek-pdhD`frsm?5GM Date: Thu, 31 Oct 2024 09:49:51 -0500 Subject: [PATCH 06/17] GH-985 Better logic for determining if deadline timer was because of interrupt --- libraries/chain/transaction_context.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libraries/chain/transaction_context.cpp b/libraries/chain/transaction_context.cpp index 279b64a982..420b22ebec 100644 --- a/libraries/chain/transaction_context.cpp +++ b/libraries/chain/transaction_context.cpp @@ -459,10 +459,10 @@ namespace eosio::chain { return; auto now = fc::time_point::now(); - if (explicit_billed_cpu_time) { + if (explicit_billed_cpu_time && block_deadline > now) { EOS_THROW( interrupt_exception, "interrupt signaled, ran ${bt}us, start ${s}", ("bt", now - pseudo_start)("s", start) ); - } else if( deadline_exception_code == deadline_exception::code_value ) { + } else if( explicit_billed_cpu_time || deadline_exception_code == deadline_exception::code_value ) { EOS_THROW( deadline_exception, "deadline exceeded ${billing_timer}us", ("billing_timer", now - pseudo_start)("now", now)("deadline", _deadline)("start", start) ); } else if( deadline_exception_code == block_cpu_usage_exceeded::code_value ) { From d0ea755794df1a64381bbc4514acc571ec2c6cc8 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Thu, 31 Oct 2024 09:50:21 -0500 Subject: [PATCH 07/17] GH-985 Avoid multiple expiration callback calls. --- .../chain/platform_timer_asio_fallback.cpp | 22 +++++++------ libraries/chain/platform_timer_kqueue.cpp | 22 +++++++------ libraries/chain/platform_timer_posix.cpp | 32 +++++++++++-------- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/libraries/chain/platform_timer_asio_fallback.cpp b/libraries/chain/platform_timer_asio_fallback.cpp index 7ec808e128..b82552e412 100644 --- a/libraries/chain/platform_timer_asio_fallback.cpp +++ b/libraries/chain/platform_timer_asio_fallback.cpp @@ -57,36 +57,40 @@ platform_timer::~platform_timer() { void platform_timer::start(fc::time_point tp) { if(tp == fc::time_point::maximum()) { - expired = 0; + expired = false; return; } fc::microseconds x = tp.time_since_epoch() - fc::time_point::now().time_since_epoch(); if(x.count() <= 0) - expired = 1; + expired = true; else { #if 0 std::promise p; auto f = p.get_future(); checktime_ios->post([&p,this]() { - expired = 0; + expired = false; p.set_value(); }); f.get(); #endif - expired = 0; + expired = false; my->timer->expires_after(std::chrono::microseconds(x.count())); my->timer->async_wait([this](const boost::system::error_code& ec) { if(ec) return; - expired = 1; - call_expiration_callback(); + bool expected = false; + if (expired.compare_exchange_strong(expected, true)) { + call_expiration_callback(); + } }); } } void platform_timer::expire_now() { - expired = 1; - call_expiration_callback(); + bool expected = false; + if (expired.compare_exchange_strong(expected, true)) { + call_expiration_callback(); + } } void platform_timer::stop() { @@ -94,7 +98,7 @@ void platform_timer::stop() { return; my->timer->cancel(); - expired = 1; + expired = true; } }} diff --git a/libraries/chain/platform_timer_kqueue.cpp b/libraries/chain/platform_timer_kqueue.cpp index 3e9dc68104..823428c804 100644 --- a/libraries/chain/platform_timer_kqueue.cpp +++ b/libraries/chain/platform_timer_kqueue.cpp @@ -58,8 +58,10 @@ platform_timer::platform_timer() { if(c == 1 && anEvent.filter == EVFILT_TIMER) { platform_timer* self = (platform_timer*)anEvent.udata; - self->expired = 1; - self->call_expiration_callback(); + bool expected = false; + if (self->expired.compare_exchange_strong(expected, true)) { + self->call_expiration_callback(); + } } else if(c == 1 && anEvent.filter == EVFILT_USER) return; @@ -90,25 +92,27 @@ platform_timer::~platform_timer() { void platform_timer::start(fc::time_point tp) { if(tp == fc::time_point::maximum()) { - expired = 0; + expired = false; return; } fc::microseconds x = tp.time_since_epoch() - fc::time_point::now().time_since_epoch(); if(x.count() <= 0) - expired = 1; + expired = true; else { struct kevent64_s aTimerEvent; EV_SET64(&aTimerEvent, my->timerid, EVFILT_TIMER, EV_ADD|EV_ENABLE|EV_ONESHOT, NOTE_USECONDS|NOTE_CRITICAL, x.count(), (uint64_t)this, 0, 0); - expired = 0; + expired = false; if(kevent64(kqueue_fd, &aTimerEvent, 1, NULL, 0, KEVENT_FLAG_IMMEDIATE, NULL) != 0) - expired = 1; + expired = true; } } void platform_timer::expire_now() { - expired = 1; - call_expiration_callback(); + bool expected = false; + if (expired.compare_exchange_strong(expected, true)) { + call_expiration_callback(); + } } void platform_timer::stop() { @@ -118,7 +122,7 @@ void platform_timer::stop() { struct kevent64_s stop_timer_event; EV_SET64(&stop_timer_event, my->timerid, EVFILT_TIMER, EV_DELETE, 0, 0, 0, 0, 0); kevent64(kqueue_fd, &stop_timer_event, 1, NULL, 0, KEVENT_FLAG_IMMEDIATE, NULL); - expired = 1; + expired = true; } }} diff --git a/libraries/chain/platform_timer_posix.cpp b/libraries/chain/platform_timer_posix.cpp index 0a50ebcade..3702333c32 100644 --- a/libraries/chain/platform_timer_posix.cpp +++ b/libraries/chain/platform_timer_posix.cpp @@ -5,12 +5,14 @@ #include #include +#include #include #include #include +#include -namespace eosio { namespace chain { +namespace eosio::chain { static_assert(std::atomic_bool::is_always_lock_free, "Only lock-free atomics AS-safe."); @@ -19,8 +21,10 @@ struct platform_timer::impl { static void sig_handler(int, siginfo_t* si, void*) { platform_timer* self = (platform_timer*)si->si_value.sival_ptr; - self->expired = 1; - self->call_expiration_callback(); + bool expected = false; + if (self->expired.compare_exchange_strong(expected, true)) { + self->call_expiration_callback(); + } } }; @@ -28,9 +32,9 @@ platform_timer::platform_timer() { static_assert(sizeof(impl) <= fwd_size); static bool initialized; - static std::mutex initalized_mutex; + static std::mutex initialized_mutex; - if(std::lock_guard guard(initalized_mutex); !initialized) { + if(std::lock_guard guard(initialized_mutex); !initialized) { struct sigaction act; sigemptyset(&act.sa_mask); act.sa_sigaction = impl::sig_handler; @@ -55,25 +59,27 @@ platform_timer::~platform_timer() { void platform_timer::start(fc::time_point tp) { if(tp == fc::time_point::maximum()) { - expired = 0; + expired = false; return; } fc::microseconds x = tp.time_since_epoch() - fc::time_point::now().time_since_epoch(); if(x.count() <= 0) - expired = 1; + expired = true; else { time_t secs = x.count() / 1000000; long nsec = (x.count() - (secs*1000000)) * 1000; struct itimerspec enable = {{0, 0}, {secs, nsec}}; - expired = 0; + expired = false; if(timer_settime(my->timerid, 0, &enable, NULL) != 0) - expired = 1; + expired = true; } } void platform_timer::expire_now() { - expired = 1; - call_expiration_callback(); + bool expected = false; + if (expired.compare_exchange_strong(expected, true)) { + call_expiration_callback(); + } } void platform_timer::stop() { @@ -81,7 +87,7 @@ void platform_timer::stop() { return; struct itimerspec disable = {{0, 0}, {0, 0}}; timer_settime(my->timerid, 0, &disable, NULL); - expired = 1; + expired = true; } -}} +} From 078781af1794767cbf8b6d13275d9fe07104fe55 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Thu, 31 Oct 2024 13:54:59 -0500 Subject: [PATCH 08/17] GH-985 Better logging for interrupted apply_block trx --- libraries/chain/controller.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/libraries/chain/controller.cpp b/libraries/chain/controller.cpp index 8fa55cadf0..474de6503e 100644 --- a/libraries/chain/controller.cpp +++ b/libraries/chain/controller.cpp @@ -3813,7 +3813,7 @@ struct controller_impl { if( transaction_failed && !transaction_can_fail) { if (trace->except->code() == interrupt_exception::code_value) { - ilog("Interrupt of trx: ${t}", ("t", *trace)); + ilog("Interrupt of trx id: ${id}", ("id", trace->id)); } else { edump((*trace)); } @@ -4370,7 +4370,15 @@ struct controller_impl { log_irreversible(); transition_to_savanna_if_needed(); return controller::apply_blocks_result::complete; - } FC_LOG_AND_RETHROW( ) + } catch (fc::exception& e) { + if (e.code() != interrupt_exception::code_value) { + wlog("${d}", ("d",e.to_detail_string())); + FC_RETHROW_EXCEPTION(e, warn, "rethrow"); + } + throw; + } catch (...) { + try { throw; } FC_LOG_AND_RETHROW() + } } controller::apply_blocks_result maybe_apply_blocks( const forked_callback_t& forked_cb, const trx_meta_cache_lookup& trx_lookup ) From 7fef19b97fdd8b3d349e725439f76e13754128c0 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 1 Nov 2024 08:16:02 -0500 Subject: [PATCH 09/17] Update to appbase with signal handling on different thread --- libraries/appbase | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libraries/appbase b/libraries/appbase index 13090992fe..5e09479dde 160000 --- a/libraries/appbase +++ b/libraries/appbase @@ -1 +1 @@ -Subproject commit 13090992fe675234716fa69d8bd037dbc4755787 +Subproject commit 5e09479dde4ce251b2dde9f17537ea234e599df6 From f8f327c0cb057331eca932fbe3a923a1fcd0e2b3 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 1 Nov 2024 08:30:23 -0500 Subject: [PATCH 10/17] GH-985 Update test now that appbase supports interrupt off main thread --- tests/interrupt_trx_test.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/interrupt_trx_test.py b/tests/interrupt_trx_test.py index ceddfddfd1..b39172f4c1 100755 --- a/tests/interrupt_trx_test.py +++ b/tests/interrupt_trx_test.py @@ -79,17 +79,16 @@ prodNode.waitForProducer("defproducera") - # Needs https://github.com/AntelopeIO/spring/issues/876 - # prodNode.processUrllibRequest("test_control", "swap_action", - # {"from":"doitslow", "to":"doitforever", - # "trx_priv_key":EOSIO_ACCT_PRIVATE_DEFAULT_KEY, - # "blk_priv_key":cluster.defproduceraAccount.activePrivateKey}) - # - # assert not prodNode.waitForHeadToAdvance(3), f"prodNode did advance head after doitforever action" - # - # prodNode.popenProc.send_signal(signal.SIGTERM) - # - # assert not prodNode.verifyAlive(), "prodNode did not exit from SIGTERM" + prodNode.processUrllibRequest("test_control", "swap_action", + {"from":"doitslow", "to":"doitforever", + "trx_priv_key":EOSIO_ACCT_PRIVATE_DEFAULT_KEY, + "blk_priv_key":cluster.defproduceraAccount.activePrivateKey}) + + assert not prodNode.waitForHeadToAdvance(3), f"prodNode did advance head after doitforever action" + + prodNode.interruptAndVerifyExitStatus() + + assert not prodNode.verifyAlive(), "prodNode did not exit from SIGINT" testSuccessful = True finally: From 8012d037b0ac8cd90850da14ce62128439912029 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 1 Nov 2024 09:49:41 -0500 Subject: [PATCH 11/17] GH-985 chain is not available until after initialize --- programs/nodeos/main.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/programs/nodeos/main.cpp b/programs/nodeos/main.cpp index a3533da6ec..bf161cf79a 100644 --- a/programs/nodeos/main.cpp +++ b/programs/nodeos/main.cpp @@ -168,12 +168,6 @@ int main(int argc, char** argv) uint32_t short_hash = 0; fc::from_hex(eosio::version::version_hash(), (char*)&short_hash, sizeof(short_hash)); - app->set_stop_executor_cb([&app]() { - ilog("appbase quit called"); - app->get_io_context().stop(); - auto& chain = app->get_plugin().chain(); - chain.interrupt_transaction(); - }); app->set_version(htonl(short_hash)); app->set_version_string(eosio::version::version_client()); app->set_full_version_string(eosio::version::version_full()); @@ -194,6 +188,12 @@ int main(int argc, char** argv) } return INITIALIZE_FAIL; } + controller& chain = app->get_plugin().chain(); + app->set_stop_executor_cb([&app, &chain]() { + ilog("appbase quit called"); + chain.interrupt_transaction(); + app->get_io_context().stop(); + }); if (auto resmon_plugin = app->find_plugin()) { resmon_plugin->monitor_directory(app->data_dir()); } else { From cae2e950552a2b8ce430271477f956ffada749fd Mon Sep 17 00:00:00 2001 From: Lin Huang Date: Fri, 1 Nov 2024 11:29:52 -0400 Subject: [PATCH 12/17] Fix a typo --- tests/ship_restart_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ship_restart_test.py b/tests/ship_restart_test.py index fbc410c50d..9637d2313e 100755 --- a/tests/ship_restart_test.py +++ b/tests/ship_restart_test.py @@ -59,14 +59,14 @@ def equalFiles(file1, file2): return True # Verifies that SHiP should fail to restart with a corrupted first entry header -def corruptedHeaderTest(pos, curruptedValue, shipNode): +def corruptedHeaderTest(pos, corruptedValue, shipNode): # restore log and index shutil.copyfile(origStateHistoryLog, stateHistoryLog) shutil.copyfile(origStateHistoryIndex, stateHistoryIndex) with open(stateHistoryLog, 'rb+') as f: # opened as binary file f.seek(pos) # seek to the position to corrupt - f.write(curruptedValue) # corrupt it + f.write(corruptedValue) # corrupt it isRelaunchSuccess = shipNode.relaunch() assert not isRelaunchSuccess, "SHiP node should have failed to relaunch" From bb4d159218eb5fb6603f0c82702800c7bd4dae90 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 1 Nov 2024 10:38:54 -0500 Subject: [PATCH 13/17] GH-985 Remove duplicate code --- libraries/chain/platform_timer_asio_fallback.cpp | 14 +------------- libraries/chain/platform_timer_kqueue.cpp | 5 +---- libraries/chain/platform_timer_posix.cpp | 5 +---- 3 files changed, 3 insertions(+), 21 deletions(-) diff --git a/libraries/chain/platform_timer_asio_fallback.cpp b/libraries/chain/platform_timer_asio_fallback.cpp index b82552e412..547bc5dcc7 100644 --- a/libraries/chain/platform_timer_asio_fallback.cpp +++ b/libraries/chain/platform_timer_asio_fallback.cpp @@ -64,24 +64,12 @@ void platform_timer::start(fc::time_point tp) { if(x.count() <= 0) expired = true; else { -#if 0 - std::promise p; - auto f = p.get_future(); - checktime_ios->post([&p,this]() { - expired = false; - p.set_value(); - }); - f.get(); -#endif expired = false; my->timer->expires_after(std::chrono::microseconds(x.count())); my->timer->async_wait([this](const boost::system::error_code& ec) { if(ec) return; - bool expected = false; - if (expired.compare_exchange_strong(expected, true)) { - call_expiration_callback(); - } + expire_now(); }); } } diff --git a/libraries/chain/platform_timer_kqueue.cpp b/libraries/chain/platform_timer_kqueue.cpp index 823428c804..16f076a0cb 100644 --- a/libraries/chain/platform_timer_kqueue.cpp +++ b/libraries/chain/platform_timer_kqueue.cpp @@ -58,10 +58,7 @@ platform_timer::platform_timer() { if(c == 1 && anEvent.filter == EVFILT_TIMER) { platform_timer* self = (platform_timer*)anEvent.udata; - bool expected = false; - if (self->expired.compare_exchange_strong(expected, true)) { - self->call_expiration_callback(); - } + self->expire_now(); } else if(c == 1 && anEvent.filter == EVFILT_USER) return; diff --git a/libraries/chain/platform_timer_posix.cpp b/libraries/chain/platform_timer_posix.cpp index 3702333c32..4388fa18b9 100644 --- a/libraries/chain/platform_timer_posix.cpp +++ b/libraries/chain/platform_timer_posix.cpp @@ -21,10 +21,7 @@ struct platform_timer::impl { static void sig_handler(int, siginfo_t* si, void*) { platform_timer* self = (platform_timer*)si->si_value.sival_ptr; - bool expected = false; - if (self->expired.compare_exchange_strong(expected, true)) { - self->call_expiration_callback(); - } + self->expire_now(); } }; From 7cc8ea4ec67a512551ed3f9968c861550d1ac159 Mon Sep 17 00:00:00 2001 From: Lin Huang Date: Fri, 1 Nov 2024 12:05:20 -0400 Subject: [PATCH 14/17] Add file open mode as an argument to Utils.compareFiles(); use Utils.compareFiles() instead of own equalFiles() --- tests/TestHarness/testUtils.py | 6 +++--- tests/ship_restart_test.py | 35 ++++++++-------------------------- 2 files changed, 11 insertions(+), 30 deletions(-) diff --git a/tests/TestHarness/testUtils.py b/tests/TestHarness/testUtils.py index 3253715c60..f44f0b8ead 100644 --- a/tests/TestHarness/testUtils.py +++ b/tests/TestHarness/testUtils.py @@ -514,9 +514,9 @@ def compare(obj1,obj2,context): return "comparison of %s type is not supported, context=%s" % (typeName,context) @staticmethod - def compareFiles(file1: str, file2: str): - f1 = open(file1) - f2 = open(file2) + def compareFiles(file1: str, file2: str, mode="r"): + f1 = open(file1, mode) + f2 = open(file2, mode) i = 0 same = True diff --git a/tests/ship_restart_test.py b/tests/ship_restart_test.py index 9637d2313e..de059460f3 100755 --- a/tests/ship_restart_test.py +++ b/tests/ship_restart_test.py @@ -39,25 +39,6 @@ origStateHistoryIndex = "" stateHistoryIndex = "" -# Returns True if file1 and file2 contain the same content -def equalFiles(file1, file2): - # not the same if sizes are different - if os.path.getsize(file1) != os.path.getsize(file2): - return False - - readSize=1024 - with open(file1, 'rb') as f1, open(file2, 'rb') as f2: - while True: - bytes1 = f1.read(readSize) - bytes2 = f2.read(readSize) - - if bytes1 != bytes2: - return False - - # end of both files - if not bytes1: - return True - # Verifies that SHiP should fail to restart with a corrupted first entry header def corruptedHeaderTest(pos, corruptedValue, shipNode): # restore log and index @@ -121,8 +102,8 @@ def corruptedHeaderTest(pos, corruptedValue, shipNode): isRelaunchSuccess = shipNode.relaunch() assert isRelaunchSuccess, "Failed to relaunch shipNode" - assert equalFiles(stateHistoryLog, origStateHistoryLog) # log unchanged - assert equalFiles(stateHistoryIndex, origStateHistoryIndex) # index regenerated + assert Utils.compareFiles(stateHistoryLog, origStateHistoryLog, mode="rb") # log unchanged + assert Utils.compareFiles(stateHistoryIndex, origStateHistoryIndex, mode="rb") # index regenerated shipNode.kill(signal.SIGTERM) # shut down ship node for next test @@ -139,8 +120,8 @@ def corruptedHeaderTest(pos, corruptedValue, shipNode): isRelaunchSuccess = shipNode.relaunch() assert isRelaunchSuccess, "Failed to relaunch shipNode" - assert equalFiles(stateHistoryLog, origStateHistoryLog) - assert equalFiles(stateHistoryIndex, origStateHistoryIndex) + assert Utils.compareFiles(stateHistoryLog, origStateHistoryLog, mode="rb") + assert Utils.compareFiles(stateHistoryIndex, origStateHistoryIndex, mode="rb") ''' #-------- Truncate index file. It should be regenerated @@ -158,8 +139,8 @@ def corruptedHeaderTest(pos, corruptedValue, shipNode): isRelaunchSuccess = shipNode.relaunch() assert isRelaunchSuccess, "Failed to relaunch shipNode" - assert equalFiles(stateHistoryLog, origStateHistoryLog) # log file unchanged - assert equalFiles(stateHistoryIndex, origStateHistoryIndex) # index file regenerated + assert Utils.compareFiles(stateHistoryLog, origStateHistoryLog, mode="rb") # log file unchanged + assert Utils.compareFiles(stateHistoryIndex, origStateHistoryIndex, mode="rb") # index file regenerated shipNode.kill(signal.SIGTERM) # shut down it for next test @@ -177,8 +158,8 @@ def corruptedHeaderTest(pos, corruptedValue, shipNode): isRelaunchSuccess = shipNode.relaunch() assert isRelaunchSuccess, "Failed to relaunch shipNode" - assert equalFiles(stateHistoryLog, origStateHistoryLog) # log file not changed - assert equalFiles(stateHistoryIndex, origStateHistoryIndex) # index file regenerated + assert Utils.compareFiles(stateHistoryLog, origStateHistoryLog, mode="rb") # log file not changed + assert Utils.compareFiles(stateHistoryIndex, origStateHistoryIndex, mode="rb") # index file regenerated shipNode.kill(signal.SIGTERM) # shut down it for next test From e636de3e89917e0778195aa40f44c7f95d1c7ed8 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Mon, 4 Nov 2024 10:00:59 -0600 Subject: [PATCH 15/17] GH-1003 Advance forkdb root when pending lib greater than head --- libraries/chain/controller.cpp | 9 ++++++--- unittests/fork_db_tests.cpp | 8 ++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/libraries/chain/controller.cpp b/libraries/chain/controller.cpp index bda29fa837..07c7c2133a 100644 --- a/libraries/chain/controller.cpp +++ b/libraries/chain/controller.cpp @@ -1497,9 +1497,12 @@ struct controller_impl { assert(!irreversible_mode() || forkdb.head()); const auto& head_id = irreversible_mode() ? forkdb.head()->id() : chain_head.id(); // verifies lib is on head branch, otherwise returns an empty branch - // The new lib needs to be on the head branch because the forkdb.advance_root() below could purge blocks that - // would be needed to be re-applied on a fork switch from the exiting chain_head. - auto branch = forkdb.fetch_branch(head_id, new_lib_id); + // The new lib needs to be on the head branch because the forkdb.advance_root() below could purge blocks that + // would be needed to be re-applied on a fork switch from the exiting chain_head. + // Pending LIB can be greater than chain head, for example when syncing, in that case fetch branch from the + // pending LIB. If the pending LIB not found on the head branch then fetch_branch returns an empty branch. + // Otherwise fetch_branch will return from chain_head to root iff chain_head on pending LIB branch. + auto branch = new_lib_num <= chain_head.block_num() ? forkdb.fetch_branch(head_id, new_lib_id) : forkdb.fetch_branch(new_lib_id, head_id); try { auto should_process = [&](auto& bsp) { // Only make irreversible blocks that have been validated. Blocks in the fork database may not be on our current best head diff --git a/unittests/fork_db_tests.cpp b/unittests/fork_db_tests.cpp index 410fe3c63d..31e9fed7a9 100644 --- a/unittests/fork_db_tests.cpp +++ b/unittests/fork_db_tests.cpp @@ -138,6 +138,14 @@ BOOST_FIXTURE_TEST_CASE(add_remove_test, generate_forkdb_state) try { BOOST_REQUIRE(branch.size() == 2); BOOST_TEST(branch[0] == bsp12a); BOOST_TEST(branch[1] == bsp11a); + + // test fetch branch when lib is greater than head + branch = forkdb.fetch_branch(bsp13b->id(), bsp12a->id()); + BOOST_TEST(branch.empty()); + branch = forkdb.fetch_branch(bsp13b->id(), bsp12b->id()); + BOOST_REQUIRE(branch.size() == 2); + BOOST_TEST(branch[0] == bsp12b); + BOOST_TEST(branch[1] == bsp11b); } FC_LOG_AND_RETHROW(); From cef82331be203a4f4d59e7853f75a9a491b679c0 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Mon, 4 Nov 2024 11:45:45 -0600 Subject: [PATCH 16/17] GH-1003 Use forkdb head in irreversible mode --- libraries/chain/controller.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libraries/chain/controller.cpp b/libraries/chain/controller.cpp index 07c7c2133a..161aac3191 100644 --- a/libraries/chain/controller.cpp +++ b/libraries/chain/controller.cpp @@ -1496,13 +1496,14 @@ struct controller_impl { auto mark_branch_irreversible = [&, this](auto& forkdb) { assert(!irreversible_mode() || forkdb.head()); const auto& head_id = irreversible_mode() ? forkdb.head()->id() : chain_head.id(); + const auto head_num = block_header::num_from_id(head_id); // verifies lib is on head branch, otherwise returns an empty branch // The new lib needs to be on the head branch because the forkdb.advance_root() below could purge blocks that // would be needed to be re-applied on a fork switch from the exiting chain_head. // Pending LIB can be greater than chain head, for example when syncing, in that case fetch branch from the // pending LIB. If the pending LIB not found on the head branch then fetch_branch returns an empty branch. // Otherwise fetch_branch will return from chain_head to root iff chain_head on pending LIB branch. - auto branch = new_lib_num <= chain_head.block_num() ? forkdb.fetch_branch(head_id, new_lib_id) : forkdb.fetch_branch(new_lib_id, head_id); + auto branch = new_lib_num <= head_num ? forkdb.fetch_branch(head_id, new_lib_id) : forkdb.fetch_branch(new_lib_id, head_id); try { auto should_process = [&](auto& bsp) { // Only make irreversible blocks that have been validated. Blocks in the fork database may not be on our current best head From b519e9c6e403762cee6e63a36780cd855b5c71f7 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Tue, 5 Nov 2024 15:30:48 -0600 Subject: [PATCH 17/17] Update to appbase main with signal handling off the main thread --- libraries/appbase | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libraries/appbase b/libraries/appbase index 5e09479dde..0b2f151bea 160000 --- a/libraries/appbase +++ b/libraries/appbase @@ -1 +1 @@ -Subproject commit 5e09479dde4ce251b2dde9f17537ea234e599df6 +Subproject commit 0b2f151beaceb1340e537bb43766428391ceddc7