Skip to content

Commit

Permalink
test: add test for replication deadlock on replication timeout (#3691)
Browse files Browse the repository at this point in the history
* test: add test for replication deadlock on replication timeout

Signed-off-by: adi_holden <[email protected]>
  • Loading branch information
adiholden authored Sep 19, 2024
1 parent efa4efd commit 409c2a3
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/server/replica.cc
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,9 @@ void Replica::MainReplicationFb() {
}

// Give a lower timeout for connect, because we're
reconnect_count_++;
ec = ConnectAndAuth(absl::GetFlag(FLAGS_master_reconnect_timeout_ms) * 1ms, &cntx_);
if (ec) {
reconnect_count_++;
LOG(WARNING) << "Error connecting to " << server().Description() << " " << ec;
continue;
}
Expand Down
2 changes: 1 addition & 1 deletion tests/dragonfly/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ def format_args(args):
async def metrics(self):
session = aiohttp.ClientSession()
resp = await session.get(f"http://localhost:{self.port}/metrics")
data = await resp.text()
data = await resp.text(encoding="utf-8")
await session.close()
return {
metric_family.name: metric_family
Expand Down
39 changes: 39 additions & 0 deletions tests/dragonfly/replication_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2338,6 +2338,45 @@ async def test_announce_ip_port(df_factory):
assert port == "1337"


@pytest.mark.asyncio
async def test_replication_timeout_on_full_sync(df_factory: DflyInstanceFactory, df_seeder_factory):
    """Exercise a replication timeout that fires while a replica is in full sync.

    The master is created with a very small ``replication_timeout`` and is
    loaded with data while a seeder keeps running against it. Once the replica
    reports the ``full_sync`` state it is paused for a second and then resumed.
    After the seeder is stopped, the test waits for the replica to finish
    replicating and calls ``assert_replica_reconnections`` with an expected
    count of 0.
    """
    # A tiny replication_timeout makes the master give up on the replica
    # as soon as the replica stops making progress.
    master = df_factory.create(replication_timeout=100, vmodule="replica=2,dflycmd=2")
    replica = df_factory.create()
    df_factory.start_all([master, replica])

    master_client = master.client()
    replica_client = replica.client()

    # Pre-populate the master so that the full sync stage is not instantaneous,
    # then keep traffic flowing in the background via the seeder.
    await master_client.execute_command("debug", "populate", "200000", "foo", "5000")
    seeder = df_seeder_factory.create(port=master.port)
    seeder_run = asyncio.create_task(seeder.run())

    # Give the seeder a moment to actually start running.
    await asyncio.sleep(0.5)

    await replica_client.execute_command(f"REPLICAOF localhost {master.port}")

    # Wait (bounded by 3 seconds overall) until the replica reports full sync.
    async with async_timeout.timeout(3):
        await wait_for_replicas_state(replica_client, state="full_sync", timeout=0.05)

    # Pause the replica so that the timeout triggers on the master side.
    await replica_client.execute_command("debug replica pause")

    await asyncio.sleep(1)

    # Resume replication on the replica.
    await replica_client.execute_command("debug replica resume")

    # Leave time for the replica to start its resync before stopping the load.
    await asyncio.sleep(1)
    seeder.stop()
    await seeder_run

    await check_all_replicas_finished([replica_client], master_client)
    await assert_replica_reconnections(replica, 0)


async def test_master_stalled_disconnect(df_factory: DflyInstanceFactory):
# disconnect after 1 second of being blocked
master = df_factory.create(replication_timeout=1000)
Expand Down

0 comments on commit 409c2a3

Please sign in to comment.