From 9053433afb2672d73b90a8dff1faefe688928236 Mon Sep 17 00:00:00 2001 From: "Doroszlai, Attila" Date: Wed, 6 Dec 2023 21:17:04 +0100 Subject: [PATCH] HDDS-9852. Intermittent timeout in testCorruptionDetected waiting for container to become unhealthy --- .../AbstractBackgroundContainerScanner.java | 49 ++++++++++++++----- .../container/ozoneimpl/OzoneContainer.java | 16 ++++++ ...groundContainerDataScannerIntegration.java | 6 ++- ...stContainerScannerIntegrationAbstract.java | 22 +++++++-- 4 files changed, 74 insertions(+), 19 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/AbstractBackgroundContainerScanner.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/AbstractBackgroundContainerScanner.java index 139952d21235..0ba01a191f70 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/AbstractBackgroundContainerScanner.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/AbstractBackgroundContainerScanner.java @@ -39,6 +39,7 @@ public abstract class AbstractBackgroundContainerScanner extends Thread { private final long dataScanInterval; private final AtomicBoolean stopping; + private final AtomicBoolean pausing = new AtomicBoolean(); public AbstractBackgroundContainerScanner(String name, long dataScanInterval) { @@ -69,30 +70,44 @@ public final void run() { @VisibleForTesting public final void runIteration() { + final boolean paused = pausing.get(); long startTime = System.nanoTime(); - scanContainers(); + if (!paused) { + scanContainers(); + } long totalDuration = System.nanoTime() - startTime; if (stopping.get()) { return; } - AbstractContainerScannerMetrics metrics = getMetrics(); - metrics.incNumScanIterations(); - LOG.info("Completed an iteration in {} minutes." + - " Number of iterations (since the data-node restart) : {}" + - ", Number of containers scanned in this iteration : {}" + - ", Number of unhealthy containers found in this iteration : {}", - TimeUnit.NANOSECONDS.toMinutes(totalDuration), - metrics.getNumScanIterations(), - metrics.getNumContainersScanned(), - metrics.getNumUnHealthyContainers()); + if (paused) { + LOG.debug("Skipped iteration due to pause"); + } else { + AbstractContainerScannerMetrics metrics = getMetrics(); + metrics.incNumScanIterations(); + LOG.info("Completed an iteration in {} minutes." + + " Number of iterations (since the data-node restart) : {}" + + ", Number of containers scanned in this iteration : {}" + + ", Number of unhealthy containers found in this iteration : {}", + TimeUnit.NANOSECONDS.toMinutes(totalDuration), + metrics.getNumScanIterations(), + metrics.getNumContainersScanned(), + metrics.getNumUnHealthyContainers()); + } long elapsedMillis = TimeUnit.NANOSECONDS.toMillis(totalDuration); long remainingSleep = dataScanInterval - elapsedMillis; handleRemainingSleep(remainingSleep); } - public final void scanContainers() { + private void scanContainers() { Iterator> itr = getContainerIterator(); - while (!stopping.get() && itr.hasNext()) { + while (itr.hasNext()) { + final boolean stopped = stopping.get(); + final boolean paused = pausing.get(); + if (stopped || paused) { + LOG.info("{} exits scan loop stop={} pause={}", this, stopped, paused); + break; + } + Container c = itr.next(); try { scanContainer(c); @@ -139,6 +154,14 @@ public synchronized void shutdown() { } } + public void pause() { + pausing.getAndSet(true); + } + + public void unpause() { + pausing.getAndSet(false); + } + @VisibleForTesting public abstract AbstractContainerScannerMetrics getMetrics(); } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java index 1e34fb104939..560913fc4992 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java @@ -70,6 +70,7 @@ import java.time.Duration; import java.util.ArrayList; import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.concurrent.ThreadFactory; @@ -111,6 +112,7 @@ public class OzoneContainer { private final ContainerController controller; private BackgroundContainerMetadataScanner metadataScanner; private List dataScanners; + private List backgroundScanners; private final BlockDeletingService blockDeletingService; private final StaleRecoveringContainerScrubbingService recoveringContainerScrubbingService; @@ -338,8 +340,10 @@ private void startContainerScrub() { "the on-demand container scanner have been disabled."); return; } + initOnDemandContainerScanner(c); + backgroundScanners = new LinkedList<>(); // This config is for testing the scanners in isolation. if (c.isMetadataScanEnabled()) { initMetadataScanner(c); @@ -363,6 +367,7 @@ private void initContainerScanner(ContainerScannerConfiguration c) { new BackgroundContainerDataScanner(c, controller, (HddsVolume) v); s.start(); dataScanners.add(s); + backgroundScanners.add(s); } } @@ -370,6 +375,7 @@ private void initMetadataScanner(ContainerScannerConfiguration c) { if (this.metadataScanner == null) { this.metadataScanner = new BackgroundContainerMetadataScanner(c, controller); + backgroundScanners.add(metadataScanner); } this.metadataScanner.start(); } @@ -402,6 +408,16 @@ private void stopContainerScrub() { OnDemandContainerDataScanner.shutdown(); } + @VisibleForTesting + public void pauseContainerScrub() { + backgroundScanners.forEach(AbstractBackgroundContainerScanner::pause); + } + + @VisibleForTesting + public void resumeContainerScrub() { + backgroundScanners.forEach(AbstractBackgroundContainerScanner::unpause); + } + /** * Starts serving requests to ozone container. * diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java index 218c35c7d3f4..adc1234c2ed5 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java @@ -71,6 +71,8 @@ static void init() throws Exception { @EnumSource void testCorruptionDetected(ContainerCorruptions corruption) throws Exception { + pauseScanner(); + long containerID = writeDataThenCloseContainer(); // Container corruption has not yet been introduced. Container container = getDnContainer(containerID); @@ -78,10 +80,12 @@ void testCorruptionDetected(ContainerCorruptions corruption) corruption.applyTo(container); + resumeScanner(); + // Wait for the scanner to detect corruption. GenericTestUtils.waitFor( () -> container.getContainerState() == State.UNHEALTHY, - 500, 5000); + 500, 15_000); // Wait for SCM to get a report of the unhealthy replica. waitForScmToSeeUnhealthyReplica(containerID); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java index 53407dbf5701..a39a5c288cf7 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java @@ -116,6 +116,15 @@ public static void buildCluster(OzoneConfiguration ozoneConfig) bucket = volume.getBucket(bucketName); } + void pauseScanner() { + getOzoneContainer().pauseContainerScrub(); + } + + void resumeScanner() { + getOzoneContainer().resumeContainerScrub(); + } + + @AfterAll static void shutdown() throws IOException { if (ozClient != null) { @@ -143,11 +152,14 @@ protected void waitForScmToCloseContainer(long containerID) throws Exception { != HddsProtos.LifeCycleState.OPEN); } - protected Container getDnContainer(long containerID) { + private static OzoneContainer getOzoneContainer() { assertEquals(1, cluster.getHddsDatanodes().size()); HddsDatanodeService dn = cluster.getHddsDatanodes().get(0); - OzoneContainer oc = dn.getDatanodeStateMachine().getContainer(); - return oc.getContainerSet().getContainer(containerID); + return dn.getDatanodeStateMachine().getContainer(); + } + + protected Container getDnContainer(long containerID) { + return getOzoneContainer().getContainerSet().getContainer(containerID); } protected long writeDataThenCloseContainer() throws Exception { @@ -350,7 +362,7 @@ private static void corruptFile(File file) { RANDOM.nextBytes(corruptedBytes); try { Files.write(file.toPath(), corruptedBytes, - StandardOpenOption.TRUNCATE_EXISTING); + StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.SYNC); } catch (IOException ex) { // Fail the test. throw new UncheckedIOException(ex); @@ -363,7 +375,7 @@ private static void corruptFile(File file) { private static void truncateFile(File file) { try { Files.write(file.toPath(), new byte[]{}, - StandardOpenOption.TRUNCATE_EXISTING); + StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.SYNC); } catch (IOException ex) { // Fail the test. throw new UncheckedIOException(ex);