From 52f5026e5ea4690be70e2ad524492634dc21d5bd Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Mon, 23 Sep 2024 16:19:42 -0500 Subject: [PATCH 1/3] GH-789 Add verification to startup/catchup test that node does not sync too far ahead. Also increase number of blocks produced at start so that nodes are in lib catchup during sync. Verify all nodes are able to sync without a block log. --- tests/TestHarness/Node.py | 21 +++++++++++ tests/nodeos_startup_catchup.py | 67 ++++++++++++++++++++++++--------- 2 files changed, 70 insertions(+), 18 deletions(-) diff --git a/tests/TestHarness/Node.py b/tests/TestHarness/Node.py index 0ccedfda0c..8cfab82bac 100644 --- a/tests/TestHarness/Node.py +++ b/tests/TestHarness/Node.py @@ -687,6 +687,27 @@ def findInLog(self, searchStr): return True return False + def linesInLog(self, searchStr): + dataDir=Utils.getNodeDataDir(self.nodeId) + files=Node.findStderrFiles(dataDir) + lines=[] + for file in files: + with open(file, 'r') as f: + for line in f: + if searchStr in line: + lines.append(line) + return lines + + def countInLog(self, searchStr) -> int: + dataDir=Utils.getNodeDataDir(self.nodeId) + files=Node.findStderrFiles(dataDir) + count = 0 + for file in files: + with open(file, 'r') as f: + contents = f.read() + count += contents.count(searchStr) + return count + # verify only one or two 'Starting block' per block number unless block is restarted def verifyStartingBlockMessages(self): dataDir=Utils.getNodeDataDir(self.nodeId) diff --git a/tests/nodeos_startup_catchup.py b/tests/nodeos_startup_catchup.py index feca64f4dc..669e1d7233 100755 --- a/tests/nodeos_startup_catchup.py +++ b/tests/nodeos_startup_catchup.py @@ -135,9 +135,9 @@ def waitForNodeStarted(node): transactionsPerBlock=targetTpsPerGenerator*trxGeneratorCnt*timePerBlock/1000 steadyStateWait=20 startBlockNum=blockNum+steadyStateWait - numBlocks=20 + numBlocks=400 endBlockNum=startBlockNum+numBlocks - waitForBlock(node0, endBlockNum) + waitForBlock(node0, endBlockNum, timeout=numBlocks) steadyStateWindowTrxs=0 steadyStateAvg=0 steadyStateWindowBlks=0 @@ -182,23 +182,18 @@ def waitForNodeStarted(node): Print("Shutdown catchup node and validate exit code") catchupNode.interruptAndVerifyExitStatus(60) - # every other catchup make a lib catchup - if catchup_num % 2 == 0: - Print(f"Wait for producer to advance lib past head of catchup {catchupHead}") - # catchupHead+5 to allow for advancement of head during shutdown of catchupNode - waitForBlock(node0, catchupHead+5, timeout=twoRoundsTimeout*2, blockType=BlockType.lib) + Print(f"Wait for producer to advance lib past head of catchup {catchupHead}") + # catchupHead+5 to allow for advancement of head during shutdown of catchupNode + waitForBlock(node0, catchupHead+5, timeout=twoRoundsTimeout*2, blockType=BlockType.lib) Print("Restart catchup node") - addSwapFlags = None - if catchup_num % 3 == 0: - addSwapFlags = {"--block-log-retain-blocks": "0", "--delete-all": ""} - catchupNode.relaunch(skipGenesis=False, addSwapFlags=addSwapFlags) + catchupNode.relaunch(skipGenesis=False) waitForNodeStarted(catchupNode) lastCatchupLibNum=lib(catchupNode) Print("Verify catchup node is advancing") # verify catchup node is advancing to producer - waitForBlock(catchupNode, lastCatchupLibNum+1, timeout=twoRoundsTimeout, blockType=BlockType.lib) + waitForBlock(catchupNode, lastLibNum, timeout=twoRoundsTimeout, blockType=BlockType.lib) Print("Verify producer is still advancing LIB") lastLibNum=lib(node0) @@ -209,20 +204,56 @@ def waitForNodeStarted(node): # verify catchup node is advancing to producer waitForBlock(catchupNode, lastLibNum, timeout=(numBlocksToCatchup/2 + 60), blockType=BlockType.lib) catchupNode.interruptAndVerifyExitStatus(60) + + Print("Verify catchup without a block log") + addSwapFlags = {"--block-log-retain-blocks": "0", "--delete-all": ""} + catchupNode.relaunch(skipGenesis=False, addSwapFlags=addSwapFlags) + waitForNodeStarted(catchupNode) + lastCatchupLibNum=lib(catchupNode) + + Print("Verify catchup node is advancing without block log") + # verify catchup node is advancing to producer + waitForBlock(catchupNode, lastLibNum+1, timeout=twoRoundsTimeout, blockType=BlockType.lib) + catchupNode.interruptAndVerifyExitStatus(60) catchupNode.popenProc=None - logFile = Utils.getNodeDataDir(catchupNodeNum) + "/stderr.txt" - f = open(logFile) - contents = f.read() + # Verify not syncing ahead of sync-fetch-span + sync_fetch_span = 1000 # default + irreversible = False + if catchupNode.nodeId in specificExtraNodeosArgs: + m = re.search(r"sync-fetch-span (\d+)", specificExtraNodeosArgs[catchupNode.nodeId]) + if m is not None: + sync_fetch_span = int(m.group(1)) + irreversible = re.search(r"irreversible", specificExtraNodeosArgs[catchupNode.nodeId]) is not None + Print(f"Verify request span for sync-fetch-span {sync_fetch_span} of {catchupNode.data_dir}") + lines = catchupNode.linesInLog("requesting range") + for line in lines: + m = re.search(r"requesting range (\d+) to (\d+), fhead (\d+), lib (\d+)", line) + if m is not None: + startBlockNum=int(m.group(1)) + endBlockNum=int(m.group(2)) + fhead=int(m.group(3)) + libNum=int(m.group(4)) + if endBlockNum-startBlockNum > sync_fetch_span: + errorExit(f"Requested range exceeds sync-fetch-span {sync_fetch_span}: {line}") + if irreversible: + # for now just use a larger tolerance, later when the logs include calculated lib this can be more precise + # See https://github.com/AntelopeIO/spring/issues/806 + if endBlockNum > fhead and fhead > libNum and endBlockNum - fhead > (sync_fetch_span*10): + errorExit(f"Requested range too far head of fork head {fhead} in irreversible mode, sync-fetch-span {sync_fetch_span}: {line}") + else: + if endBlockNum > fhead and fhead > libNum and endBlockNum - fhead > (sync_fetch_span*2-1): + errorExit(f"Requested range too far head of fork head {fhead} sync-fetch-span {sync_fetch_span}: {line}") + # See https://github.com/AntelopeIO/spring/issues/81 for fix to reduce the number of expected unlinkable blocks # Test verifies LIB is advancing, check to see that not too many unlinkable block exceptions are generated # while syncing up to head. - numUnlinkable = contents.count("3030001 unlinkable_block_exception: Unlinkable block") + numUnlinkable = catchupNode.countInLog("unlinkable_block") numUnlinkableAllowed = 500 - Print(f"Node{catchupNodeNum} has {numUnlinkable} unlinkable_block_exception in {logFile}") + Print(f"Node{catchupNodeNum} has {numUnlinkable} unlinkable_block in {catchupNode.data_dir}") if numUnlinkable > numUnlinkableAllowed: errorExit(f"Node{catchupNodeNum} has {numUnlinkable} which is more than the configured " - f"allowed {numUnlinkableAllowed} unlinkable blocks: {logFile}.") + f"allowed {numUnlinkableAllowed} unlinkable blocks: {catchupNode.data_dir}.") testSuccessful=True From 41df1c5388fc0028d07c2242afde3e5eeb854333 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Mon, 23 Sep 2024 16:21:11 -0500 Subject: [PATCH 2/3] GH-789 Do not allow sync ahead until blocks are in the forkdb to evaluate correctly. --- plugins/net_plugin/net_plugin.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index a77e19cc57..cf827c963a 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2192,13 +2192,15 @@ namespace eosio { controller& cc = my_impl->chain_plug->chain(); if (cc.get_read_mode() == db_read_mode::IRREVERSIBLE) { auto forkdb_head = cc.fork_db_head(); - auto calculated_lib = forkdb_head.irreversible_blocknum(); - auto num_blocks_that_can_be_applied = calculated_lib > head_num ? calculated_lib - head_num : 0; - if (num_blocks_that_can_be_applied < sync_fetch_span) { - if (head_num ) - fc_ilog(logger, "sync ahead allowed past sync-fetch-span ${sp}, block ${bn} for paused LIB ${l}, chain_lib ${cl}, forkdb size ${s}", - ("bn", blk_num)("sp", sync_fetch_span)("l", calculated_lib)("cl", head_num)("s", cc.fork_db_size())); - return true; + if (forkdb_head.block_num() >= blk_num) { + auto calculated_lib = forkdb_head.irreversible_blocknum(); + auto num_blocks_that_can_be_applied = calculated_lib > head_num ? calculated_lib - head_num : 0; + if (num_blocks_that_can_be_applied < sync_fetch_span) { + if (head_num ) + fc_ilog(logger, "sync ahead allowed past sync-fetch-span ${sp}, block ${bn} for paused LIB ${l}, chain_lib ${cl}, forkdb size ${s}", + ("bn", blk_num)("sp", sync_fetch_span)("l", calculated_lib)("cl", head_num)("s", cc.fork_db_size())); + return true; + } } } From 71a142e3aaf27ecd2644918fdd5db37ad27a5f1f Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Tue, 24 Sep 2024 06:55:24 -0500 Subject: [PATCH 3/3] GH-789 Use offset instead of absolute check --- plugins/net_plugin/net_plugin.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index cf827c963a..c39b8bbb73 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -2192,15 +2192,15 @@ namespace eosio { controller& cc = my_impl->chain_plug->chain(); if (cc.get_read_mode() == db_read_mode::IRREVERSIBLE) { auto forkdb_head = cc.fork_db_head(); - if (forkdb_head.block_num() >= blk_num) { - auto calculated_lib = forkdb_head.irreversible_blocknum(); - auto num_blocks_that_can_be_applied = calculated_lib > head_num ? calculated_lib - head_num : 0; - if (num_blocks_that_can_be_applied < sync_fetch_span) { - if (head_num ) - fc_ilog(logger, "sync ahead allowed past sync-fetch-span ${sp}, block ${bn} for paused LIB ${l}, chain_lib ${cl}, forkdb size ${s}", - ("bn", blk_num)("sp", sync_fetch_span)("l", calculated_lib)("cl", head_num)("s", cc.fork_db_size())); - return true; - } + auto calculated_lib = forkdb_head.irreversible_blocknum(); + auto num_blocks_that_can_be_applied = calculated_lib > head_num ? calculated_lib - head_num : 0; + // add blocks that can potentially be applied as they are not in the forkdb yet + num_blocks_that_can_be_applied += blk_num > forkdb_head.block_num() ? blk_num - forkdb_head.block_num() : 0; + if (num_blocks_that_can_be_applied < sync_fetch_span) { + if (head_num ) + fc_ilog(logger, "sync ahead allowed past sync-fetch-span ${sp}, block ${bn} for paused LIB ${l}, chain_lib ${cl}, forkdb size ${s}", + ("bn", blk_num)("sp", sync_fetch_span)("l", calculated_lib)("cl", head_num)("s", cc.fork_db_size())); + return true; } }