Skip to content

Commit

Permalink
HDDS-9852. Intermittent timeout in testCorruptionDetected waiting for…
Browse files Browse the repository at this point in the history
… container to become unhealthy
  • Loading branch information
adoroszlai committed Dec 6, 2023
1 parent c0f79c4 commit 9053433
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ public abstract class AbstractBackgroundContainerScanner extends Thread {
private final long dataScanInterval;

private final AtomicBoolean stopping;
private final AtomicBoolean pausing = new AtomicBoolean();

public AbstractBackgroundContainerScanner(String name,
long dataScanInterval) {
Expand Down Expand Up @@ -69,30 +70,44 @@ public final void run() {

@VisibleForTesting
public final void runIteration() {
final boolean paused = pausing.get();
long startTime = System.nanoTime();
scanContainers();
if (!paused) {
scanContainers();
}
long totalDuration = System.nanoTime() - startTime;
if (stopping.get()) {
return;
}
AbstractContainerScannerMetrics metrics = getMetrics();
metrics.incNumScanIterations();
LOG.info("Completed an iteration in {} minutes." +
" Number of iterations (since the data-node restart) : {}" +
", Number of containers scanned in this iteration : {}" +
", Number of unhealthy containers found in this iteration : {}",
TimeUnit.NANOSECONDS.toMinutes(totalDuration),
metrics.getNumScanIterations(),
metrics.getNumContainersScanned(),
metrics.getNumUnHealthyContainers());
if (paused) {
LOG.debug("Skipped iteration due to pause");
} else {
AbstractContainerScannerMetrics metrics = getMetrics();
metrics.incNumScanIterations();
LOG.info("Completed an iteration in {} minutes." +
" Number of iterations (since the data-node restart) : {}" +
", Number of containers scanned in this iteration : {}" +
", Number of unhealthy containers found in this iteration : {}",
TimeUnit.NANOSECONDS.toMinutes(totalDuration),
metrics.getNumScanIterations(),
metrics.getNumContainersScanned(),
metrics.getNumUnHealthyContainers());
}
long elapsedMillis = TimeUnit.NANOSECONDS.toMillis(totalDuration);
long remainingSleep = dataScanInterval - elapsedMillis;
handleRemainingSleep(remainingSleep);
}

public final void scanContainers() {
private void scanContainers() {
Iterator<Container<?>> itr = getContainerIterator();
while (!stopping.get() && itr.hasNext()) {
while (itr.hasNext()) {
final boolean stopped = stopping.get();
final boolean paused = pausing.get();
if (stopped || paused) {
LOG.info("{} exits scan loop stop={} pause={}", this, stopped, paused);
break;
}

Container<?> c = itr.next();
try {
scanContainer(c);
Expand Down Expand Up @@ -139,6 +154,14 @@ public synchronized void shutdown() {
}
}

/**
 * Asks this scanner to pause by raising the {@code pausing} flag.
 *
 * <p>The flag is polled by {@code runIteration()} and by the container scan
 * loop before each container is fetched, so a scan that is already running
 * stops at the next container boundary and later iterations skip scanning
 * until {@link #unpause()} is called.
 */
public void pause() {
  // The previous value is not needed, so a plain set() is enough.
  pausing.set(true);
}

/**
 * Clears the {@code pausing} flag so that iterations started after this
 * call scan containers again.
 */
public void unpause() {
  // The previous value is not needed, so a plain set() is enough.
  pausing.set(false);
}

@VisibleForTesting
public abstract AbstractContainerScannerMetrics getMetrics();
}
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
import java.time.Duration;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ThreadFactory;
Expand Down Expand Up @@ -111,6 +112,7 @@ public class OzoneContainer {
private final ContainerController controller;
private BackgroundContainerMetadataScanner metadataScanner;
private List<BackgroundContainerDataScanner> dataScanners;
private List<AbstractBackgroundContainerScanner> backgroundScanners;
private final BlockDeletingService blockDeletingService;
private final StaleRecoveringContainerScrubbingService
recoveringContainerScrubbingService;
Expand Down Expand Up @@ -338,8 +340,10 @@ private void startContainerScrub() {
"the on-demand container scanner have been disabled.");
return;
}

initOnDemandContainerScanner(c);

backgroundScanners = new LinkedList<>();
// This config is for testing the scanners in isolation.
if (c.isMetadataScanEnabled()) {
initMetadataScanner(c);
Expand All @@ -363,13 +367,15 @@ private void initContainerScanner(ContainerScannerConfiguration c) {
new BackgroundContainerDataScanner(c, controller, (HddsVolume) v);
s.start();
dataScanners.add(s);
backgroundScanners.add(s);
}
}

/**
 * Creates the background metadata scanner on first use, registers the newly
 * created instance in {@code backgroundScanners}, and starts the scanner.
 *
 * @param c scanner configuration handed to the scanner's constructor
 */
private void initMetadataScanner(ContainerScannerConfiguration c) {
  if (metadataScanner == null) {
    final BackgroundContainerMetadataScanner created =
        new BackgroundContainerMetadataScanner(c, controller);
    metadataScanner = created;
    // Track the new scanner so pause/resume requests reach it as well.
    backgroundScanners.add(created);
  }
  metadataScanner.start();
}
Expand Down Expand Up @@ -402,6 +408,16 @@ private void stopContainerScrub() {
OnDemandContainerDataScanner.shutdown();
}

/**
 * Pauses every registered background container scanner.
 *
 * <p>Does nothing when no scanner list exists yet.
 */
@VisibleForTesting
public void pauseContainerScrub() {
  // backgroundScanners is only assigned once container scrubbing has been
  // started; it is still null when startContainerScrub() returns early
  // because the scanners are disabled.
  if (backgroundScanners == null) {
    return;
  }
  backgroundScanners.forEach(AbstractBackgroundContainerScanner::pause);
}

/**
 * Resumes every registered background container scanner.
 *
 * <p>Does nothing when no scanner list exists yet.
 */
@VisibleForTesting
public void resumeContainerScrub() {
  // backgroundScanners is only assigned once container scrubbing has been
  // started; it is still null when startContainerScrub() returns early
  // because the scanners are disabled.
  if (backgroundScanners == null) {
    return;
  }
  backgroundScanners.forEach(AbstractBackgroundContainerScanner::unpause);
}

/**
* Starts serving requests to ozone container.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,17 +71,21 @@ static void init() throws Exception {
@EnumSource
void testCorruptionDetected(ContainerCorruptions corruption)
throws Exception {
pauseScanner();

long containerID = writeDataThenCloseContainer();
// Container corruption has not yet been introduced.
Container<?> container = getDnContainer(containerID);
assertEquals(State.CLOSED, container.getContainerState());

corruption.applyTo(container);

resumeScanner();

// Wait for the scanner to detect corruption.
GenericTestUtils.waitFor(
() -> container.getContainerState() == State.UNHEALTHY,
500, 5000);
500, 15_000);

// Wait for SCM to get a report of the unhealthy replica.
waitForScmToSeeUnhealthyReplica(containerID);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,15 @@ public static void buildCluster(OzoneConfiguration ozoneConfig)
bucket = volume.getBucket(bucketName);
}

/** Pauses the background container scanners of the datanode under test. */
void pauseScanner() {
  final OzoneContainer ozoneContainer = getOzoneContainer();
  ozoneContainer.pauseContainerScrub();
}

/** Resumes the background container scanners of the datanode under test. */
void resumeScanner() {
  final OzoneContainer ozoneContainer = getOzoneContainer();
  ozoneContainer.resumeContainerScrub();
}


@AfterAll
static void shutdown() throws IOException {
if (ozClient != null) {
Expand Down Expand Up @@ -143,11 +152,14 @@ protected void waitForScmToCloseContainer(long containerID) throws Exception {
!= HddsProtos.LifeCycleState.OPEN);
}

protected Container<?> getDnContainer(long containerID) {
private static OzoneContainer getOzoneContainer() {
assertEquals(1, cluster.getHddsDatanodes().size());
HddsDatanodeService dn = cluster.getHddsDatanodes().get(0);
OzoneContainer oc = dn.getDatanodeStateMachine().getContainer();
return oc.getContainerSet().getContainer(containerID);
return dn.getDatanodeStateMachine().getContainer();
}

/**
 * Looks up a container on the test cluster's datanode by its ID.
 *
 * @param containerID ID of the container to fetch
 * @return the result of the datanode container set's lookup for that ID
 */
protected Container<?> getDnContainer(long containerID) {
  final OzoneContainer ozoneContainer = getOzoneContainer();
  return ozoneContainer.getContainerSet().getContainer(containerID);
}

protected long writeDataThenCloseContainer() throws Exception {
Expand Down Expand Up @@ -350,7 +362,7 @@ private static void corruptFile(File file) {
RANDOM.nextBytes(corruptedBytes);
try {
Files.write(file.toPath(), corruptedBytes,
StandardOpenOption.TRUNCATE_EXISTING);
StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.SYNC);
} catch (IOException ex) {
// Fail the test.
throw new UncheckedIOException(ex);
Expand All @@ -363,7 +375,7 @@ private static void corruptFile(File file) {
private static void truncateFile(File file) {
try {
Files.write(file.toPath(), new byte[]{},
StandardOpenOption.TRUNCATE_EXISTING);
StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.SYNC);
} catch (IOException ex) {
// Fail the test.
throw new UncheckedIOException(ex);
Expand Down

0 comments on commit 9053433

Please sign in to comment.