From 29c762666f8428fbdc5f4d5948679109f5a71b58 Mon Sep 17 00:00:00 2001
From: Kevin Heifner
Date: Thu, 16 May 2024 19:56:13 -0500
Subject: [PATCH 1/8] GH-13 Add rmArgs option to relaunch. Add
 removeFinalizersSafetyFile method.

---
 tests/TestHarness/Node.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/TestHarness/Node.py b/tests/TestHarness/Node.py
index 385f5a8f0a..f548766f2f 100644
--- a/tests/TestHarness/Node.py
+++ b/tests/TestHarness/Node.py
@@ -360,7 +360,8 @@ def rmFromCmd(self, matchValue: str):
 
     # pylint: disable=too-many-locals
     # If nodeosPath is equal to None, it will use the existing nodeos path
-    def relaunch(self, chainArg=None, newChain=False, skipGenesis=True, timeout=Utils.systemWaitTimeout, addSwapFlags=None, nodeosPath=None, waitForTerm=False):
+    def relaunch(self, chainArg=None, newChain=False, skipGenesis=True, timeout=Utils.systemWaitTimeout,
+                 addSwapFlags=None, rmArgs=None, nodeosPath=None, waitForTerm=False):
         assert(self.pid is None)
         assert(self.killed)
 
@@ -370,6 +371,10 @@ def relaunch(self, chainArg=None, newChain=False, skipGenesis=True, timeout=Util
         cmdArr=self.cmd[:]
         if nodeosPath: cmdArr[0] = nodeosPath
         toAddOrSwap=copy.deepcopy(addSwapFlags) if addSwapFlags is not None else {}
+        if rmArgs is not None:
+            for v in shlex.split(rmArgs):
+                i = cmdArr.index(v)
+                cmdArr.pop(i)
         if not newChain:
             if skipGenesis:
                 try:
@@ -558,6 +563,11 @@ def removeReversibleBlks(self):
         reversibleBlks = os.path.join(dataDir, "blocks", "reversible")
         shutil.rmtree(reversibleBlks, ignore_errors=True)
 
+    def removeFinalizersSafetyFile(self):
+        dataDir = Utils.getNodeDataDir(self.nodeId)
+        finalizersDir = os.path.join(dataDir, "finalizers")
+        shutil.rmtree(finalizersDir, ignore_errors=True)
+
     @staticmethod
     def findStderrFiles(path):
         files=[]
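Note on the rmArgs handling above: relaunch() tokenizes rmArgs with shlex.split and removes each token from the node's saved command line one at a time, so a flag and its value must both appear in rmArgs exactly as they were originally passed. A minimal standalone sketch of that loop (the command line here is illustrative only):

    import shlex

    cmdArr = ["nodeos", "-e", "--snapshot", "/tmp/snap.bin", "--plugin", "eosio::chain_api_plugin"]
    rmArgs = " --snapshot /tmp/snap.bin"
    for v in shlex.split(rmArgs):
        cmdArr.pop(cmdArr.index(v))  # locate and drop each token individually
    # cmdArr is now ["nodeos", "-e", "--plugin", "eosio::chain_api_plugin"]

One caveat: list.index raises ValueError if a token in rmArgs is not present in the saved command line, so callers must pass arguments that were actually used at the previous launch.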
From 04ef5aa980d11fd4c05034f2be1227f5c2acc097 Mon Sep 17 00:00:00 2001
From: Kevin Heifner
Date: Thu, 16 May 2024 19:56:44 -0500
Subject: [PATCH 2/8] GH-13 Add disaster recovery scenario 3

---
 tests/disaster_recovery_3.py | 131 +++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100755 tests/disaster_recovery_3.py

diff --git a/tests/disaster_recovery_3.py b/tests/disaster_recovery_3.py
new file mode 100755
index 0000000000..27f1e56129
--- /dev/null
+++ b/tests/disaster_recovery_3.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+import os
+import shutil
+import signal
+import time
+from TestHarness import Cluster, TestHelper, Utils, WalletMgr
+from TestHarness.Node import BlockType
+
+###############################################################
+# disaster_recovery - Scenario 3
+#
+# Create integration test with 4 nodes (A, B, C, and D) which each have their own producer and finalizer. The finalizer
+# policy consists of the four finalizers with a threshold of 3. The proposer policy involves all four proposers.
+#
+# - At least two of the four nodes should have a LIB N and a finalizer safety information file that locks on a block
+# after N. The other two nodes should have a LIB that is less than or equal to block N.
+#
+# All nodes are shut down. The reversible blocks on all nodes are deleted. Restart all nodes from an earlier snapshot.
+#
+# All nodes eventually sync up to block N. Some nodes will consider block N to be LIB but others may not.
+#
+# Not enough finalizers should be voting because of the lock in their finalizer safety information file. Verify that
+# LIB does not advance on any node.
+#
+# Cleanly shut down all nodes and delete their finalizer safety information files. Then restart the nodes.
+#
+# Verify that LIB advances on all nodes and they all agree on the LIB. In particular, verify that block N has the
+# same ID on all nodes as it did before the nodes were first shut down.
+#
+###############################################################

+
+Print=Utils.Print
+errorExit=Utils.errorExit
+
+args=TestHelper.parse_args({"-d","--keep-logs","--dump-error-details","-v","--leave-running","--unshared"})
+pnodes=4
+delay=args.d
+debug=args.v
+prod_count = 1 # per node prod count
+total_nodes=pnodes
+dumpErrorDetails=args.dump_error_details
+
+Utils.Debug=debug
+testSuccessful=False
+
+cluster=Cluster(unshared=args.unshared, keepRunning=args.leave_running, keepLogs=args.keep_logs)
+walletMgr=WalletMgr(True, keepRunning=args.leave_running, keepLogs=args.keep_logs)
+
+try:
+    TestHelper.printSystemInfo("BEGIN")
+
+    cluster.setWalletMgr(walletMgr)
+
+    Print(f'producing nodes: {pnodes}, delay between nodes launch: {delay} second{"s" if delay != 1 else ""}')
+
+    Print("Stand up cluster")
+    if cluster.launch(pnodes=pnodes, totalNodes=total_nodes, totalProducers=pnodes, delay=delay, loadSystemContract=False,
+                      activateIF=True, biosFinalizer=False) is False:
+        errorExit("Failed to stand up eos cluster.")
+
+    assert cluster.biosNode.getInfo(exitOnError=True)["head_block_producer"] != "eosio", "launch should have waited for production to change"
+    cluster.biosNode.kill(signal.SIGTERM)
+    cluster.waitOnClusterSync(blockAdvancing=5)
+
+    node0 = cluster.getNode(0) # A
+    node1 = cluster.getNode(1) # B
+    node2 = cluster.getNode(2) # C
+    node3 = cluster.getNode(3) # D
+
+    Print("Create snapshot (node 0)")
+    ret = node0.createSnapshot()
+    assert ret is not None, "Snapshot creation failed"
+    ret_head_block_num = ret["payload"]["head_block_num"]
+    Print(f"Snapshot head block number {ret_head_block_num}")
+
+    Print("Wait for snapshot node lib to advance")
+    assert node0.waitForBlock(ret_head_block_num+1, blockType=BlockType.lib), "Node0 did not advance to make snapshot block LIB"
+    assert node1.waitForLibToAdvance(), "Node1 did not advance LIB after snapshot of Node0"
+
+    assert node0.waitForLibToAdvance(), "Node0 did not advance LIB after snapshot"
+    currentLIB = node0.getIrreversibleBlockNum()
+    libBlock = node0.getBlock(currentLIB)
+    Print(f"Lib Block: {libBlock}")
+
+    Print("Shutdown all nodes")
+    for node in [node0, node1, node2, node3]:
+        node.kill(signal.SIGTERM)
+    for node in [node0, node1, node2, node3]:
+        assert not node.verifyAlive(), "Node did not shutdown"
+
+    Print("Remove reversible blocks and state, but not finalizers safety data")
+    for node in [node0, node1, node2, node3]:
+        node.removeReversibleBlks()
+        node.removeState()
+
+    Print("Restart nodes from snapshot")
+    for i in range(4):
+        isRelaunchSuccess = cluster.getNode(i).relaunch(chainArg=" -e --snapshot {}".format(node0.getLatestSnapshot()))
+        assert isRelaunchSuccess, f"node {i} relaunch from snapshot failed"
+
+    Print("Verify LIB does not advance on any node")
+    for node in [node0, node1, node2, node3]:
+        assert not node.waitForLibToAdvance(), "Node advanced LIB after relaunch when it should not"
+
+    Print("Shutdown all nodes to remove finalizer safety data")
+    for node in [node0, node1, node2, node3]:
+        node.kill(signal.SIGTERM)
+    for node in [node0, node1, node2, node3]:
+        assert not node.verifyAlive(), "Node did not shutdown"
+
+    for node in [node0, node1, node2, node3]:
+        node.removeFinalizersSafetyFile()
+
+    Print("Restart nodes")
+    for node in [node0, node1, node2, node3]:
+        node.relaunch(rmArgs=" --snapshot {}".format(node0.getLatestSnapshot()))
+
+    Print("Verify LIB advances on all nodes")
+    for node in [node0, node1, node2, node3]:
+        assert node.waitForLibToAdvance(), "Node did not advance LIB after restart"
+
+    for node in [node0, node1, node2, node3]:
+        nodeId = node.getBlock(currentLIB)["id"]
+        assert nodeId == libBlock["id"], "Node lib block id does not match prior lib block id"
+
+    testSuccessful=True
+finally:
+    TestHelper.shutdown(cluster, walletMgr, testSuccessful=testSuccessful, dumpErrorDetails=dumpErrorDetails)
+
+exitCode = 0 if testSuccessful else 1
+exit(exitCode)
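The stall this test expects follows directly from the 3-of-4 finalizer threshold described above: after the snapshot restart, any node whose safety file locks on a block after N refuses to vote, and with at least two of the four finalizers locked out there is no quorum. A back-of-the-envelope check (numbers taken from the scenario description):

    threshold = 3   # finalizer policy threshold
    total     = 4   # finalizers A, B, C, D
    locked    = 2   # at least two nodes lock on a block after N
    voting    = total - locked
    assert voting < threshold  # 2 < 3: no quorum, so LIB cannot advance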
--snapshot {}".format(node0.getLatestSnapshot())) + + Print("Verify LIB advances on all nodes") + for node in [node0, node1, node2, node3]: + assert node.waitForLibToAdvance(), "Node did not advance LIB after restart" + + for node in [node0, node1, node2, node3]: + nodeId = node.getBlock(currentLIB)["id"] + assert nodeId == libBlock["id"], "Node lib block id does not match prior lib block id" + + testSuccessful=True +finally: + TestHelper.shutdown(cluster, walletMgr, testSuccessful=testSuccessful, dumpErrorDetails=dumpErrorDetails) + +exitCode = 0 if testSuccessful else 1 +exit(exitCode) From 02797aec8054cae83cd06c6064e115279f16bb74 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Thu, 16 May 2024 19:56:51 -0500 Subject: [PATCH 3/8] GH-13 Add disaster recovery scenario 3 --- tests/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e1376c6cb5..85d5bf697e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -71,6 +71,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/transition_to_if.py ${CMAKE_CURRENT_B configure_file(${CMAKE_CURRENT_SOURCE_DIR}/disaster_recovery.py ${CMAKE_CURRENT_BINARY_DIR}/disaster_recovery.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/disaster_recovery_2.py ${CMAKE_CURRENT_BINARY_DIR}/disaster_recovery_2.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/disaster_recovery_2_test_shape.json ${CMAKE_CURRENT_BINARY_DIR}/disaster_recovery_2_test_shape.json COPYONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/disaster_recovery_3.py ${CMAKE_CURRENT_BINARY_DIR}/disaster_recovery_3.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/trx_finality_status_test.py ${CMAKE_CURRENT_BINARY_DIR}/trx_finality_status_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/trx_finality_status_forked_test.py ${CMAKE_CURRENT_BINARY_DIR}/trx_finality_status_forked_test.py COPYONLY) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/plugin_http_api_test.py ${CMAKE_CURRENT_BINARY_DIR}/plugin_http_api_test.py COPYONLY) @@ -154,6 +155,8 @@ add_test(NAME disaster_recovery COMMAND tests/disaster_recovery.py -v ${UNSHARE} set_property(TEST disaster_recovery PROPERTY LABELS nonparallelizable_tests) add_test(NAME disaster_recovery_2 COMMAND tests/disaster_recovery_2.py -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_property(TEST disaster_recovery_2 PROPERTY LABELS nonparallelizable_tests) +add_test(NAME disaster_recovery_3 COMMAND tests/disaster_recovery_3.py -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +set_property(TEST disaster_recovery_3 PROPERTY LABELS nonparallelizable_tests) add_test(NAME ship_reqs_across_svnn_test COMMAND tests/ship_reqs_across_svnn_test.py -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_property(TEST ship_reqs_across_svnn_test PROPERTY LABELS nonparallelizable_tests) From 82d22f6f73515ad666a88b2deaeb1bd858e087fe Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Fri, 17 May 2024 12:26:19 -0500 Subject: [PATCH 4/8] GH-13 Rename method to indicate it removes entire directory. Update test description. 
From 82d22f6f73515ad666a88b2deaeb1bd858e087fe Mon Sep 17 00:00:00 2001
From: Kevin Heifner
Date: Fri, 17 May 2024 12:26:19 -0500
Subject: [PATCH 4/8] GH-13 Rename method to indicate it removes entire
 directory. Update test description.

---
 tests/TestHarness/Node.py    |  2 +-
 tests/disaster_recovery_3.py | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/TestHarness/Node.py b/tests/TestHarness/Node.py
index f548766f2f..d386121fba 100644
--- a/tests/TestHarness/Node.py
+++ b/tests/TestHarness/Node.py
@@ -563,7 +563,7 @@ def removeReversibleBlks(self):
         reversibleBlks = os.path.join(dataDir, "blocks", "reversible")
         shutil.rmtree(reversibleBlks, ignore_errors=True)
 
-    def removeFinalizersSafetyFile(self):
+    def removeFinalizersSafetyDir(self):
         dataDir = Utils.getNodeDataDir(self.nodeId)
         finalizersDir = os.path.join(dataDir, "finalizers")
         shutil.rmtree(finalizersDir, ignore_errors=True)

diff --git a/tests/disaster_recovery_3.py b/tests/disaster_recovery_3.py
index 27f1e56129..9b3ba874e3 100755
--- a/tests/disaster_recovery_3.py
+++ b/tests/disaster_recovery_3.py
@@ -9,11 +9,11 @@
 ###############################################################
 # disaster_recovery - Scenario 3
 #
-# Create integration test with 4 nodes (A, B, C, and D) which each have their own producer and finalizer. The finalizer
-# policy consists of the four finalizers with a threshold of 3. The proposer policy involves all four proposers.
+# Integration test with 4 nodes (A, B, C, and D), each having its own producer and finalizer. The finalizer policy
+# consists of the four finalizers with a threshold of 3. The proposer policy involves all four proposers.
 #
-# - At least two of the four nodes should have a LIB N and a finalizer safety information file that locks on a block
-# after N. The other two nodes should have a LIB that is less than or equal to block N.
+# - At least two of the four nodes have a LIB N and a finalizer safety information file that locks on a block
+# after N. The other two nodes have a LIB that is less than or equal to block N.
 #
 # All nodes are shut down. The reversible blocks on all nodes are deleted. Restart all nodes from an earlier snapshot.
 #
@@ -108,8 +108,9 @@
     for node in [node0, node1, node2, node3]:
         assert not node.verifyAlive(), "Node did not shutdown"
 
+    Print("Remove finalizer safety data")
     for node in [node0, node1, node2, node3]:
-        node.removeFinalizersSafetyFile()
+        node.removeFinalizersSafetyDir()
 
     Print("Restart nodes")
     for node in [node0, node1, node2, node3]:
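The rename is worth dwelling on: the finalizer safety data lives under a directory in the node's data dir, and shutil.rmtree removes that whole tree, so removeFinalizersSafetyDir describes the behavior where removeFinalizersSafetyFile did not. A minimal sketch of the behavior being relied on (paths and the safety file name are illustrative, not the actual nodeos layout):

    import os, shutil, tempfile

    dataDir = tempfile.mkdtemp()
    finalizersDir = os.path.join(dataDir, "finalizers")
    os.makedirs(finalizersDir)
    open(os.path.join(finalizersDir, "safety.dat"), "w").close()  # hypothetical file name

    shutil.rmtree(finalizersDir, ignore_errors=True)  # removes the directory and all its contents
    assert not os.path.exists(finalizersDir)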
From 86f0b4ff8f8b34f58cf618c6fae3787e830f5c66 Mon Sep 17 00:00:00 2001
From: Kevin Heifner
Date: Fri, 17 May 2024 13:19:45 -0500
Subject: [PATCH 5/8] GH-13 Make sure LIB advances before shutdown

---
 tests/disaster_recovery_3.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/disaster_recovery_3.py b/tests/disaster_recovery_3.py
index 9b3ba874e3..0d4d91343e 100755
--- a/tests/disaster_recovery_3.py
+++ b/tests/disaster_recovery_3.py
@@ -82,10 +82,20 @@
     libBlock = node0.getBlock(currentLIB)
     Print(f"Lib Block: {libBlock}")
 
-    Print("Shutdown all nodes")
-    for node in [node0, node1, node2, node3]:
+    Print("Shutdown two nodes")
+    for node in [node0, node1]:
         node.kill(signal.SIGTERM)
-    for node in [node0, node1, node2, node3]:
+    for node in [node0, node1]:
+        assert not node.verifyAlive(), "Node did not shutdown"
+
+    Print("Wait for lib to advance on other nodes")
+    for node in [node2, node3]:
+        assert node.waitForBlock(currentLIB-1, timeout=None, blockType=BlockType.lib), "Node did not advance LIB after shutdown of node0 and node1"
+
+    Print("Shutdown other two nodes")
+    for node in [node2, node3]:
+        node.kill(signal.SIGTERM)
+    for node in [node2, node3]:
         assert not node.verifyAlive(), "Node did not shutdown"
 
     Print("Remove reversible blocks and state, but not finalizers safety data")
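The staggered shutdown relies on the harness's waitForBlock with blockType=BlockType.lib, which polls until the node's irreversible block number passes the target. A simplified model of the semantics being relied on, not the actual harness implementation:

    import time

    def wait_for_lib_past(node, target, poll=0.5, timeout=60):
        # Returns True once the node reports LIB strictly greater than target.
        deadline = time.time() + timeout
        while time.time() < deadline:
            if node.getIrreversibleBlockNum() > target:
                return True
            time.sleep(poll)
        return False

The strict greater-than comparison is why this patch waits on currentLIB-1, and why the next patch switches the target to n_LIB-1.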
node2 & node3 have lib of n_LIB + assert node.waitForBlock(n_LIB-1, timeout=None, blockType=BlockType.lib), "Node did not advance LIB after shutdown of node0 and node1" Print("Shutdown other two nodes") for node in [node2, node3]: @@ -108,6 +108,13 @@ isRelaunchSuccess = cluster.getNode(i).relaunch(chainArg=" -e --snapshot {}".format(node0.getLatestSnapshot())) assert isRelaunchSuccess, f"node {i} relaunch from snapshot failed" + Print("Verify forks resolve and libBlock is included on all nodes") + Print(f"Lib Block: {libBlock}") + for node in [node0, node1, node2, node3]: + node.waitForBlock(n_LIB) + nodeId = node.getBlock(n_LIB)["id"] + assert nodeId == libBlock["id"], "Node lib block id does not match prior lib block id" + Print("Verify LIB does not advance on any node") for node in [node0, node1, node2, node3]: assert not node.waitForLibToAdvance(), "Node advanced LIB after relaunch when it should not" @@ -131,7 +138,7 @@ assert node.waitForLibToAdvance(), "Node did not advance LIB after restart" for node in [node0, node1, node2, node3]: - nodeId = node.getBlock(currentLIB)["id"] + nodeId = node.getBlock(n_LIB)["id"] assert nodeId == libBlock["id"], "Node lib block id does not match prior lib block id" testSuccessful=True From 15ec639929c55ee7a2e2c8184193c4d0daf4dd1d Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Tue, 21 May 2024 07:20:42 -0500 Subject: [PATCH 7/8] GH-13 Add more descriptive print statements --- tests/disaster_recovery_3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/disaster_recovery_3.py b/tests/disaster_recovery_3.py index 3ad438ad12..c49fb3e0a7 100755 --- a/tests/disaster_recovery_3.py +++ b/tests/disaster_recovery_3.py @@ -82,13 +82,13 @@ n_LIB = currentLIB + 1 libBlock = node0.getBlock(n_LIB) - Print("Shutdown two nodes") + Print("Shutdown two nodes at LIB N-1, should be locked on block after N") for node in [node0, node1]: node.kill(signal.SIGTERM) for node in [node0, node1]: assert not node.verifyAlive(), "Node did not shutdown" - Print("Wait for lib to advance on other nodes") + Print("Wait for lib to advance to LIB N on other 2 nodes") for node in [node2, node3]: # waitForBlock uses > not >=. 
node2 & node3 have lib of n_LIB assert node.waitForBlock(n_LIB-1, timeout=None, blockType=BlockType.lib), "Node did not advance LIB after shutdown of node0 and node1" From f359b76cfd30950a62f000490a2ab0c970997a86 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Wed, 22 May 2024 13:42:41 -0500 Subject: [PATCH 8/8] GH-13 Improve test by pausing production on node0,node1 and verifying LIB before shutdown --- tests/disaster_recovery_3.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/disaster_recovery_3.py b/tests/disaster_recovery_3.py index c49fb3e0a7..bbf7c14221 100755 --- a/tests/disaster_recovery_3.py +++ b/tests/disaster_recovery_3.py @@ -54,8 +54,9 @@ Print(f'producing nodes: {pnodes}, delay between nodes launch: {delay} second{"s" if delay != 1 else ""}') Print("Stand up cluster") + extraNodeosArgs = " --plugin eosio::producer_api_plugin " if cluster.launch(pnodes=pnodes, totalNodes=total_nodes, totalProducers=pnodes, delay=delay, loadSystemContract=False, - activateIF=True, biosFinalizer=False) is False: + extraNodeosArgs=extraNodeosArgs, activateIF=True, biosFinalizer=False) is False: errorExit("Failed to stand up eos cluster.") assert cluster.biosNode.getInfo(exitOnError=True)["head_block_producer"] != "eosio", "launch should have waited for production to change" @@ -76,8 +77,14 @@ Print("Wait for snapshot node lib to advance") assert node0.waitForBlock(ret_head_block_num+1, blockType=BlockType.lib), "Node0 did not advance to make snapshot block LIB" assert node1.waitForLibToAdvance(), "Node1 did not advance LIB after snapshot of Node0" - assert node0.waitForLibToAdvance(), "Node0 did not advance LIB after snapshot" + + Print("Stop production on Node0 and Node1") + assert node0.waitForProducer("defproducera"), "Node 0 did not produce" + for node in [node0, node1]: + node.processUrllibRequest("producer", "pause", exitOnError=True) + time.sleep(0.5) + currentLIB = node0.getIrreversibleBlockNum() n_LIB = currentLIB + 1 libBlock = node0.getBlock(n_LIB) @@ -91,6 +98,8 @@ Print("Wait for lib to advance to LIB N on other 2 nodes") for node in [node2, node3]: # waitForBlock uses > not >=. node2 & node3 have lib of n_LIB assert node.waitForBlock(n_LIB-1, timeout=None, blockType=BlockType.lib), "Node did not advance LIB after shutdown of node0 and node1" + currentLIB = node.getIrreversibleBlockNum() + assert currentLIB == n_LIB, f"Node advanced LIB {currentLIB} beyond N LIB {n_LIB}" Print("Shutdown other two nodes") for node in [node2, node3]: