From 5eb0ba6d5d6a3fe91e9a67699c9febdb891d224a Mon Sep 17 00:00:00 2001
From: Owen O'Malley <omalley@apache.org>
Date: Sat, 1 Feb 2020 10:43:03 -0800
Subject: [PATCH 01/13] HADOOP-11867: Add gather API to file system.

 Conflicts:
	hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java
	hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
	hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java
	hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java
	pom.xml
---
 hadoop-common-project/benchmark/pom.xml       |  91 ++++++
 .../benchmark/src/main/assembly/uber.xml      |  33 ++
 .../benchmark/src/main/findbugs/exclude.xml   |  25 ++
 .../hadoop/benchmark/AsyncBenchmark.java      | 242 ++++++++++++++
 hadoop-common-project/hadoop-common/pom.xml   |   8 +
 .../hadoop/fs/BufferedFSInputStream.java      |  27 +-
 .../apache/hadoop/fs/ChecksumFileSystem.java  | 204 +++++++++---
 .../apache/hadoop/fs/FSDataInputStream.java   |  22 +-
 .../java/org/apache/hadoop/fs/FileRange.java  |  54 ++++
 .../org/apache/hadoop/fs/FileRangeImpl.java   |  61 ++++
 .../apache/hadoop/fs/PositionedReadable.java  |  39 ++-
 .../apache/hadoop/fs/RawLocalFileSystem.java  | 103 +++++-
 .../hadoop/fs/impl/AsyncReaderUtils.java      | 217 +++++++++++++
 .../hadoop/fs/impl/CombinedFileRange.java     |  71 +++++
 .../hadoop/fs/impl/TestAsyncReaderUtils.java  | 301 ++++++++++++++++++
 hadoop-common-project/pom.xml                 |   8 +
 pom.xml                                       |   1 +
 17 files changed, 1454 insertions(+), 53 deletions(-)
 create mode 100644 hadoop-common-project/benchmark/pom.xml
 create mode 100644 hadoop-common-project/benchmark/src/main/assembly/uber.xml
 create mode 100644 hadoop-common-project/benchmark/src/main/findbugs/exclude.xml
 create mode 100644 hadoop-common-project/benchmark/src/main/java/org/apache/hadoop/benchmark/AsyncBenchmark.java
 create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileRange.java
 create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileRangeImpl.java
 create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AsyncReaderUtils.java
 create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/CombinedFileRange.java
 create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestAsyncReaderUtils.java
diff --git a/hadoop-common-project/benchmark/pom.xml b/hadoop-common-project/benchmark/pom.xml
new file mode 100644
index 0000000000000..19999781503fc
--- /dev/null
+++ b/hadoop-common-project/benchmark/pom.xml
@@ -0,0 +1,91 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+
+
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
+                      https://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.hadoop</groupId>
+    <artifactId>hadoop-project</artifactId>
+    <version>3.4.0-SNAPSHOT</version>
+    <relativePath>../../hadoop-project</relativePath>
+  </parent>
+  <artifactId>hadoop-benchmark</artifactId>
+  <version>3.4.0-SNAPSHOT</version>
+  <packaging>jar</packaging>
+
+  <name>Apache Hadoop Common Benchmark</name>
+  <description>Apache Hadoop Common Benchmark</description>
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <maven.compiler.useIncrementalCompilation>false</maven.compiler.useIncrementalCompilation>
+    <jmh.version>1.20</jmh.version>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-common</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.openjdk.jmh</groupId>
+      <artifactId>jmh-core</artifactId>
+      <version>${jmh.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.openjdk.jmh</groupId>
+      <artifactId>jmh-generator-annprocess</artifactId>
+      <version>${jmh.version}</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifest>
+              <mainClass>org.apache.hadoop.benchmark.AsyncBenchmark</mainClass>
+            </manifest>
+          </archive>
+          <descriptors>
+            <descriptor>src/main/assembly/uber.xml</descriptor>
+          </descriptors>
+        </configuration>
+        <executions>
+          <execution>
+            <id>make-assembly</id> <!-- this is used for inheritance merges -->
+            <phase>package</phase> <!-- bind to the packaging phase -->
+            <goals>
+              <goal>single</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>findbugs-maven-plugin</artifactId>
+          <configuration>
+            <excludeFilterFile>${basedir}/src/main/findbugs/exclude.xml</excludeFilterFile>
+          </configuration>
+      </plugin>
+    </plugins>
+  </build>
+</project>
diff --git a/hadoop-common-project/benchmark/src/main/assembly/uber.xml b/hadoop-common-project/benchmark/src/main/assembly/uber.xml
new file mode 100644
index 0000000000000..014eab951b3cf
--- /dev/null
+++ b/hadoop-common-project/benchmark/src/main/assembly/uber.xml
@@ -0,0 +1,33 @@
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<assembly>
+  <id>uber</id>
+  <formats>
+    <format>jar</format>
+  </formats>
+  <includeBaseDirectory>false</includeBaseDirectory>
+  <dependencySets>
+    <dependencySet>
+      <outputDirectory>/</outputDirectory>
+      <useProjectArtifact>true</useProjectArtifact>
+      <unpack>true</unpack>
+      <scope>runtime</scope>
+    </dependencySet>
+  </dependencySets>
+  <containerDescriptorHandlers>
+    <containerDescriptorHandler>
+      <handlerName>metaInf-services</handlerName>
+    </containerDescriptorHandler>
+  </containerDescriptorHandlers>
+</assembly>
diff --git a/hadoop-common-project/benchmark/src/main/findbugs/exclude.xml b/hadoop-common-project/benchmark/src/main/findbugs/exclude.xml
new file mode 100644
index 0000000000000..f0b00b84b4d45
--- /dev/null
+++ b/hadoop-common-project/benchmark/src/main/findbugs/exclude.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<FindBugsFilter>
+  <Match>
+    <Bug pattern="EI_EXPOSE_REP,EI_EXPOSE_REP2"/>
+  </Match>
+  <Match>
+    <Class name="~org\.openjdk\.jmh\.infra\.generated.*"/>
+  </Match>
+  <Match>
+    <Class name="~org\.apache\.hadoop\.benchmark\.generated.*"/>
+  </Match>
+</FindBugsFilter>
diff --git a/hadoop-common-project/benchmark/src/main/java/org/apache/hadoop/benchmark/AsyncBenchmark.java b/hadoop-common-project/benchmark/src/main/java/org/apache/hadoop/benchmark/AsyncBenchmark.java
new file mode 100644
index 0000000000000..a576acc18f7a9
--- /dev/null
+++ b/hadoop-common-project/benchmark/src/main/java/org/apache/hadoop/benchmark/AsyncBenchmark.java
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.benchmark;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileRange;
+import org.apache.hadoop.fs.FileRangeImpl;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.fs.Path;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.infra.Blackhole;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+import java.io.EOFException;
+import java.io.IOException;
+
+import java.nio.ByteBuffer;
+import java.nio.channels.AsynchronousFileChannel;
+import java.nio.channels.CompletionHandler;
+import java.nio.file.FileSystems;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.function.IntFunction;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+public class AsyncBenchmark {
+
+  static final Path DATA_PATH = getTestDataPath();
+  static final String DATA_PATH_PROPERTY = "bench.data";
+
+  static Path getTestDataPath() {
+    String value = System.getProperty(DATA_PATH_PROPERTY);
+    return new Path(value == null ? "/tmp/taxi.orc" : value);
+  }
+
+  @State(Scope.Thread)
+  public static class FileSystemChoice {
+
+    @Param({"local", "raw"})
+    String fileSystemKind;
+
+    Configuration conf;
+    FileSystem fs;
+
+    @Setup(Level.Trial)
+    public void setup() {
+      conf = new Configuration();
+      try {
+        LocalFileSystem local = FileSystem.getLocal(conf);
+        fs = "raw".equals(fileSystemKind) ? local.getRaw() : local;
+      } catch (IOException e) {
+        throw new IllegalArgumentException("Can't get filesystem", e);
+      }
+    }
+  }
+
+  @State(Scope.Thread)
+  public static class BufferChoice {
+    @Param({"direct", "array"})
+    String bufferKind;
+
+    IntFunction<ByteBuffer> allocate;
+    @Setup(Level.Trial)
+    public void setup() {
+      allocate = "array".equals(bufferKind)
+                     ? ByteBuffer::allocate : ByteBuffer::allocateDirect;
+    }
+  }
+
+  @Benchmark
+  public void asyncRead(FileSystemChoice fsChoice,
+                        BufferChoice bufferChoice,
+                        Blackhole blackhole) throws Exception {
+    FSDataInputStream stream = fsChoice.fs.open(DATA_PATH);
+    List<FileRange> ranges = new ArrayList<>();
+    for(int m=0; m < 100; ++m) {
+      FileRangeImpl range = new FileRangeImpl(m * 1024L * 1024, 64 * 1024);
+      ranges.add(range);
+    }
+    stream.readAsync(ranges, bufferChoice.allocate);
+    for(FileRange range: ranges) {
+      blackhole.consume(range.getData().get());
+    }
+    stream.close();
+  }
+
+  static class Joiner implements CompletionHandler<ByteBuffer, FileRange> {
+    private int remaining;
+    private final ByteBuffer[] result;
+    private Throwable exception = null;
+
+    Joiner(int total) {
+      remaining = total;
+      result = new ByteBuffer[total];
+    }
+
+    synchronized void finish() {
+      remaining -= 1;
+      if (remaining == 0) {
+        notify();
+      }
+    }
+
+    synchronized ByteBuffer[] join() throws InterruptedException, IOException {
+      while (remaining > 0 && exception == null) {
+        wait();
+      }
+      if (exception != null) {
+        throw new IOException("problem reading", exception);
+      }
+      return result;
+    }
+
+
+    @Override
+    public synchronized void completed(ByteBuffer buffer, FileRange attachment) {
+      result[--remaining] = buffer;
+      if (remaining == 0) {
+        notify();
+      }
+    }
+
+    @Override
+    public synchronized void failed(Throwable exc, FileRange attachment) {
+      this.exception = exc;
+      notify();
+    }
+  }
+
+  static class FileRangeCallback extends FileRangeImpl implements CompletionHandler<Integer, FileRangeCallback> {
+    private final AsynchronousFileChannel channel;
+    private final ByteBuffer buffer;
+    private int completed = 0;
+    private final Joiner joiner;
+
+    FileRangeCallback(AsynchronousFileChannel channel, long offset,
+                      int length, Joiner joiner, ByteBuffer buffer) {
+      super(offset, length);
+      this.channel = channel;
+      this.joiner = joiner;
+      this.buffer = buffer;
+    }
+
+    @Override
+    public void completed(Integer result, FileRangeCallback attachment) {
+      final int bytes = result;
+      if (bytes == -1) {
+        failed(new EOFException("Read past end of file"), this);
+      }
+      completed += bytes;
+      if (completed < length) {
+        channel.read(buffer, offset + completed, this, this);
+      } else {
+        buffer.flip();
+        joiner.finish();
+      }
+    }
+
+    @Override
+    public void failed(Throwable exc, FileRangeCallback attachment) {
+       joiner.failed(exc, this);
+    }
+  }
+
+  @Benchmark
+  public void asyncFileChanArray(BufferChoice bufferChoice,
+                                 Blackhole blackhole) throws Exception {
+    java.nio.file.Path path = FileSystems.getDefault().getPath(DATA_PATH.toString());
+    AsynchronousFileChannel channel = AsynchronousFileChannel.open(path, StandardOpenOption.READ);
+    List<FileRangeImpl> ranges = new ArrayList<>();
+    Joiner joiner = new Joiner(100);
+    final int SIZE = 64 * 1024;
+    for(int m=0; m < 100; ++m) {
+      ByteBuffer buffer = bufferChoice.allocate.apply(SIZE);
+      FileRangeCallback range = new FileRangeCallback(channel, m * 1024L * 1024,
+          SIZE, joiner, buffer);
+      ranges.add(range);
+      channel.read(buffer, range.getOffset(), range, range);
+    }
+    joiner.join();
+    channel.close();
+    blackhole.consume(ranges);
+  }
+
+  @Benchmark
+  public void syncRead(FileSystemChoice fsChoice,
+                       Blackhole blackhole) throws Exception {
+    FSDataInputStream stream = fsChoice.fs.open(DATA_PATH);
+    List<byte[]> result = new ArrayList<>();
+    for(int m=0; m < 100; ++m) {
+      byte[] buffer = new byte[64 * 1024];
+      stream.readFully(m * 1024L * 1024, buffer);
+      result.add(buffer);
+    }
+    blackhole.consume(result);
+    stream.close();
+  }
+
+  /**
+   * Run the benchmarks.
+   * @param args the pathname of a 100MB data file
+   */
+  public static void main(String[] args) throws Exception {
+    OptionsBuilder opts = new OptionsBuilder();
+    opts.include("AsyncBenchmark");
+    opts.jvmArgs("-server", "-Xms256m", "-Xmx2g",
+        "-D" + DATA_PATH_PROPERTY + "=" + args[0]);
+    opts.forks(1);
+    new Runner(opts.build()).run();
+  }
+}
diff --git a/hadoop-common-project/hadoop-common/pom.xml b/hadoop-common-project/hadoop-common/pom.xml
index bcba2288300f3..71fb2ce353003 100644
--- a/hadoop-common-project/hadoop-common/pom.xml
+++ b/hadoop-common-project/hadoop-common/pom.xml
@@ -651,6 +651,14 @@
           </filesets>
         </configuration>
       </plugin>
+        <plugin>
+            <groupId>org.apache.maven.plugins</groupId>
+            <artifactId>maven-compiler-plugin</artifactId>
+            <configuration>
+                <source>8</source>
+                <target>8</target>
+            </configuration>
+        </plugin>
     </plugins>
   </build>
 
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java
index 59345f5d25caf..e958a6bebf5e8 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -22,6 +22,9 @@
 import java.io.FileDescriptor;
 import java.io.IOException;
 import java.util.StringJoiner;
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.function.IntFunction;
 
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
@@ -158,8 +161,24 @@ public IOStatistics getIOStatistics() {
   @Override
   public String toString() {
     return new StringJoiner(", ",
-        BufferedFSInputStream.class.getSimpleName() + "[", "]")
-        .add("in=" + in)
-        .toString();
+            BufferedFSInputStream.class.getSimpleName() + "[", "]")
+            .add("in=" + in)
+            .toString();
+  }
+
+  @Override
+  public int minimumReasonableSeek() {
+    return ((PositionedReadable) in).minimumReasonableSeek();
+  }
+
+  @Override
+  public int maximumReadSize() {
+    return ((PositionedReadable) in).maximumReadSize();
+  }
+
+  @Override
+  public void readAsync(List<? extends FileRange> ranges,
+                        IntFunction<ByteBuffer> allocate) {
+    ((PositionedReadable) in).readAsync(ranges, allocate);
   }
 }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
index c7f8e36c3f675..1cc640a4c9e7d 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -22,18 +22,26 @@
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.IntBuffer;
 import java.nio.channels.ClosedChannelException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.EnumSet;
 import java.util.List;
 import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CompletionException;
+import java.util.function.IntFunction;
+import java.util.zip.CRC32;
 
 import org.apache.hadoop.util.Preconditions;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.impl.AbstractFSBuilderImpl;
+import org.apache.hadoop.fs.impl.AsyncReaderUtils;
+import org.apache.hadoop.fs.impl.CombinedFileRange;
 import org.apache.hadoop.fs.impl.FutureDataInputStreamBuilderImpl;
 import org.apache.hadoop.fs.impl.OpenFileParameters;
 import org.apache.hadoop.fs.permission.AclEntry;
@@ -66,7 +74,7 @@ public abstract class ChecksumFileSystem extends FilterFileSystem {
   public static double getApproxChkSumLength(long size) {
     return ChecksumFSOutputSummer.CHKSUM_AS_FRACTION * size;
   }
-  
+
   public ChecksumFileSystem(FileSystem fs) {
     super(fs);
   }
@@ -82,7 +90,7 @@ public void setConf(Configuration conf) {
           bytesPerChecksum);
     }
   }
-  
+
   /**
    * Set whether to verify checksum.
    */
@@ -95,7 +103,7 @@ public void setVerifyChecksum(boolean verifyChecksum) {
   public void setWriteChecksum(boolean writeChecksum) {
     this.writeChecksum = writeChecksum;
   }
-  
+
   /** get the raw file system */
   @Override
   public FileSystem getRawFileSystem() {
@@ -113,7 +121,7 @@ public static boolean isChecksumFile(Path file) {
     return name.startsWith(".") && name.endsWith(".crc");
   }
 
-  /** Return the length of the checksum file given the size of the 
+  /** Return the length of the checksum file given the size of the
    * actual file.
    **/
   public long getChecksumFileLength(Path file, long fileSize) {
@@ -143,18 +151,18 @@ private static class ChecksumFSInputChecker extends FSInputChecker implements
     private ChecksumFileSystem fs;
     private FSDataInputStream datas;
     private FSDataInputStream sums;
-    
+
     private static final int HEADER_LENGTH = 8;
-    
+
     private int bytesPerSum = 1;
-    
+
     public ChecksumFSInputChecker(ChecksumFileSystem fs, Path file)
       throws IOException {
       this(fs, file, fs.getConf().getInt(
-                       LocalFileSystemConfigKeys.LOCAL_FS_STREAM_BUFFER_SIZE_KEY, 
+                       LocalFileSystemConfigKeys.LOCAL_FS_STREAM_BUFFER_SIZE_KEY,
                        LocalFileSystemConfigKeys.LOCAL_FS_STREAM_BUFFER_SIZE_DEFAULT));
     }
-    
+
     public ChecksumFSInputChecker(ChecksumFileSystem fs, Path file, int bufferSize)
       throws IOException {
       super( file, fs.getFileStatus(file).getReplication() );
@@ -170,7 +178,8 @@ public ChecksumFSInputChecker(ChecksumFileSystem fs, Path file, int bufferSize)
         if (!Arrays.equals(version, CHECKSUM_VERSION))
           throw new IOException("Not a checksum file: "+sumFile);
         this.bytesPerSum = sums.readInt();
-        set(fs.verifyChecksum, DataChecksum.newCrc32(), bytesPerSum, 4);
+        set(fs.verifyChecksum, DataChecksum.newCrc32(), bytesPerSum,
+            FSInputChecker.CHECKSUM_SIZE);
       } catch (IOException e) {
         // mincing the message is terrible, but java throws permission
         // exceptions as FNF because that's all the method signatures allow!
@@ -182,21 +191,21 @@ public ChecksumFSInputChecker(ChecksumFileSystem fs, Path file, int bufferSize)
         set(fs.verifyChecksum, null, 1, 0);
       }
     }
-    
+
     private long getChecksumFilePos( long dataPos ) {
-      return HEADER_LENGTH + 4*(dataPos/bytesPerSum);
+      return HEADER_LENGTH + FSInputChecker.CHECKSUM_SIZE*(dataPos/bytesPerSum);
     }
-    
+
     @Override
     protected long getChunkPosition( long dataPos ) {
       return dataPos/bytesPerSum*bytesPerSum;
     }
-    
+
     @Override
     public int available() throws IOException {
       return datas.available() + super.available();
     }
-    
+
     @Override
     public int read(long position, byte[] b, int off, int len)
       throws IOException {
@@ -214,7 +223,7 @@ public int read(long position, byte[] b, int off, int len)
       }
       return nread;
     }
-    
+
     @Override
     public void close() throws IOException {
       datas.close();
@@ -223,7 +232,7 @@ public void close() throws IOException {
       }
       set(fs.verifyChecksum, null, 1, 0);
     }
-    
+
 
     @Override
     public boolean seekToNewSource(long targetPos) throws IOException {
@@ -246,7 +255,7 @@ protected int readChunk(long pos, byte[] buf, int offset, int len,
         final int checksumsToRead = Math.min(
           len/bytesPerSum, // number of checksums based on len to read
           checksum.length / CHECKSUM_SIZE); // size of checksum buffer
-        long checksumPos = getChecksumFilePos(pos); 
+        long checksumPos = getChecksumFilePos(pos);
         if(checksumPos != sums.getPos()) {
           sums.seek(checksumPos);
         }
@@ -286,8 +295,123 @@ protected int readChunk(long pos, byte[] buf, int offset, int len,
     public IOStatistics getIOStatistics() {
       return IOStatisticsSupport.retrieveIOStatistics(datas);
     }
+
+    public static long findChecksumOffset(long dataOffset,
+                                          int bytesPerSum) {
+      return HEADER_LENGTH + (dataOffset/bytesPerSum) * FSInputChecker.CHECKSUM_SIZE;
+    }
+
+    /**
+     * Find the checksum ranges that correspond to the given data ranges.
+     * @param dataRanges the input data ranges, which are assumed to be sorted
+     *                   and non-overlapping
+     * @return a list of AsyncReaderUtils.CombinedFileRange that correspond to
+     *         the checksum ranges
+     */
+    public static List<CombinedFileRange> findChecksumRanges(
+        List<? extends FileRange> dataRanges,
+        int bytesPerSum,
+        int minSeek,
+        int maxSize) {
+      List<CombinedFileRange> result = new ArrayList<>();
+      CombinedFileRange currentCrc = null;
+      for(FileRange range: dataRanges) {
+        long crcOffset = findChecksumOffset(range.getOffset(), bytesPerSum);
+        long crcEnd = findChecksumOffset(range.getOffset() + range.getLength() +
+                                             bytesPerSum - 1, bytesPerSum);
+        if (currentCrc == null ||
+                !currentCrc.merge(crcOffset, crcEnd, range, minSeek, maxSize)) {
+          currentCrc = new CombinedFileRange(crcOffset, crcEnd, range);
+          result.add(currentCrc);
+        }
+      }
+      return result;
+    }
+
+    /**
+     * Check the data against the checksums.
+     * @param sumsBytes the checksum data
+     * @param sumsOffset where from the checksum file this buffer started
+     * @param data the file data
+     * @param dataOffset where the file data started (must be a multiple of
+     *                  bytesPerSum)
+     * @param bytesPerSum how many bytes per a checksum
+     * @param file the path of the filename
+     * @return the data buffer
+     * @throws CompletionException if the checksums don't match
+     */
+    static ByteBuffer checkBytes(ByteBuffer sumsBytes,
+                                 long sumsOffset,
+                                 ByteBuffer data,
+                                 long dataOffset,
+                                 int bytesPerSum,
+                                 Path file) {
+      // determine how many bytes we need to skip at the start of the sums
+      int offset =
+          (int) (findChecksumOffset(dataOffset, bytesPerSum) - sumsOffset);
+      IntBuffer sums = sumsBytes.asIntBuffer();
+      sums.position(offset / FSInputChecker.CHECKSUM_SIZE);
+      ByteBuffer current = data.duplicate();
+      int NUM_CHUNKS = data.remaining() / bytesPerSum;
+      CRC32 crc = new CRC32();
+      // check each chunk to ensure they match
+      for(int c = 0; c < NUM_CHUNKS; ++c) {
+        // set the buffer position and the limit
+        current.limit((c + 1) * bytesPerSum);
+        current.position(c * bytesPerSum);
+        // compute the crc
+        crc.reset();
+        crc.update(current);
+        int expected = sums.get();
+        int calculated = (int) crc.getValue();
+
+        if (calculated != expected) {
+          // cast of c added to silence findbugs
+          long errPosn = dataOffset + (long) c * bytesPerSum;
+          throw new CompletionException(new ChecksumException(
+              "Checksum error: " + file + " at " + errPosn +
+                  " exp: " + expected + " got: " + calculated, errPosn));
+        }
+      }
+      // if everything matches, we return the data
+      return data;
+    }
+
+    @Override
+    public void readAsync(List<? extends FileRange> ranges,
+                          IntFunction<ByteBuffer> allocate) {
+      // If the stream doesn't have checksums, just delegate.
+      if (sums == null) {
+        datas.readAsync(ranges, allocate);
+        return;
+      }
+      int minSeek = minimumReasonableSeek();
+      int maxSize = maximumReadSize();
+      List<CombinedFileRange> dataRanges =
+          AsyncReaderUtils.sortAndMergeRanges(ranges, bytesPerSum,
+              minSeek, maximumReadSize());
+      List<CombinedFileRange> checksumRanges = findChecksumRanges(dataRanges,
+          bytesPerSum, minSeek, maxSize);
+      sums.readAsync(checksumRanges, allocate);
+      datas.readAsync(dataRanges, allocate);
+      for(CombinedFileRange checksumRange: checksumRanges) {
+        for(FileRange dataRange: checksumRange.getUnderlying()) {
+          // when we have both the ranges, validate the checksum
+          CompletableFuture<ByteBuffer> result =
+              checksumRange.getData().thenCombineAsync(dataRange.getData(),
+                  (sumBuffer, dataBuffer) ->
+                      checkBytes(sumBuffer, checksumRange.getOffset(),
+                          dataBuffer, dataRange.getOffset(), bytesPerSum, file));
+          // Now, slice the read data range to the user's ranges
+          for(FileRange original: ((CombinedFileRange) dataRange).getUnderlying()) {
+            original.setData(result.thenApply(
+                (b) -> AsyncReaderUtils.sliceTo(b, dataRange.getOffset(), original)));
+          }
+        }
+      }
+    }
   }
-  
+
   private static class FSDataBoundedInputStream extends FSDataInputStream {
     private FileSystem fs;
     private Path file;
@@ -298,12 +422,12 @@ private static class FSDataBoundedInputStream extends FSDataInputStream {
       this.fs = fs;
       this.file = file;
     }
-    
+
     @Override
     public boolean markSupported() {
       return false;
     }
-    
+
     /* Return the file length */
     private long getFileLength() throws IOException {
       if( fileLen==-1L ) {
@@ -311,7 +435,7 @@ private long getFileLength() throws IOException {
       }
       return fileLen;
     }
-    
+
     /**
      * Skips over and discards <code>n</code> bytes of data from the
      * input stream.
@@ -335,11 +459,11 @@ public synchronized long skip(long n) throws IOException {
       }
       return super.skip(n);
     }
-    
+
     /**
      * Seek to the given position in the stream.
      * The next read() will be from that position.
-     * 
+     *
      * <p>This method does not allow seek past the end of the file.
      * This produces IOException.
      *
@@ -404,22 +528,22 @@ public void concat(final Path f, final Path[] psrcs) throws IOException {
    */
   public static long getChecksumLength(long size, int bytesPerSum) {
     //the checksum length is equal to size passed divided by bytesPerSum +
-    //bytes written in the beginning of the checksum file.  
-    return ((size + bytesPerSum - 1) / bytesPerSum) * 4 +
-             CHECKSUM_VERSION.length + 4;  
+    //bytes written in the beginning of the checksum file.
+    return ((size + bytesPerSum - 1) / bytesPerSum) * FSInputChecker.CHECKSUM_SIZE +
+             ChecksumFSInputChecker.HEADER_LENGTH;
   }
 
   /** This class provides an output stream for a checksummed file.
    * It generates checksums for data. */
   private static class ChecksumFSOutputSummer extends FSOutputSummer
       implements IOStatisticsSource, StreamCapabilities {
-    private FSDataOutputStream datas;    
+    private FSDataOutputStream datas;
     private FSDataOutputStream sums;
     private static final float CHKSUM_AS_FRACTION = 0.01f;
     private boolean isClosed = false;
-    
-    public ChecksumFSOutputSummer(ChecksumFileSystem fs, 
-                          Path file, 
+
+    public ChecksumFSOutputSummer(ChecksumFileSystem fs,
+                          Path file,
                           boolean overwrite,
                           int bufferSize,
                           short replication,
@@ -440,7 +564,7 @@ public ChecksumFSOutputSummer(ChecksumFileSystem fs,
       sums.write(CHECKSUM_VERSION, 0, CHECKSUM_VERSION.length);
       sums.writeInt(bytesPerSum);
     }
-    
+
     @Override
     public void close() throws IOException {
       try {
@@ -451,7 +575,7 @@ public void close() throws IOException {
         isClosed = true;
       }
     }
-    
+
     @Override
     protected void writeChunk(byte[] b, int offset, int len, byte[] checksum,
         int ckoff, int cklen)
@@ -707,7 +831,7 @@ public boolean rename(Path src, Path dst) throws IOException {
         value = fs.rename(srcCheckFile, dstCheckFile);
       } else if (fs.exists(dstCheckFile)) {
         // no src checksum, so remove dst checksum
-        value = fs.delete(dstCheckFile, true); 
+        value = fs.delete(dstCheckFile, true);
       }
 
       return value;
@@ -739,7 +863,7 @@ public boolean delete(Path f, boolean recursive) throws IOException{
       return fs.delete(f, true);
     }
   }
-    
+
   final private static PathFilter DEFAULT_FILTER = new PathFilter() {
     @Override
     public boolean accept(Path file) {
@@ -750,7 +874,7 @@ public boolean accept(Path file) {
   /**
    * List the statuses of the files/directories in the given path if the path is
    * a directory.
-   * 
+   *
    * @param f
    *          given path
    * @return the statuses of the files/directories in the given path
@@ -771,7 +895,7 @@ public RemoteIterator<FileStatus> listStatusIterator(final Path p)
   /**
    * List the statuses of the files/directories in the given path if the path is
    * a directory.
-   * 
+   *
    * @param f
    *          given path
    * @return the statuses of the files/directories in the given patch
@@ -782,7 +906,7 @@ public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
   throws IOException {
     return fs.listLocatedStatus(f, DEFAULT_FILTER);
   }
-  
+
   @Override
   public boolean mkdirs(Path f) throws IOException {
     return fs.mkdirs(f);
@@ -832,7 +956,7 @@ public void copyToLocalFile(Path src, Path dst, boolean copyCrc)
     } else {
       FileStatus[] srcs = listStatus(src);
       for (FileStatus srcFile : srcs) {
-        copyToLocalFile(srcFile.getPath(), 
+        copyToLocalFile(srcFile.getPath(),
                         new Path(dst, srcFile.getPath().getName()), copyCrc);
       }
     }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java
index b143a4cb63d19..8b704e385a639 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java
@@ -1,4 +1,4 @@
-/**
+/*
  * 
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
@@ -26,6 +26,8 @@
 import java.io.InputStream;
 import java.nio.ByteBuffer;
 import java.util.EnumSet;
+import java.util.List;
+import java.util.function.IntFunction;
 
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
@@ -51,7 +53,7 @@ public class FSDataInputStream extends DataInputStream
    */
   private final IdentityHashStore<ByteBuffer, ByteBufferPool>
     extendedReadBuffers
-      = new IdentityHashStore<ByteBuffer, ByteBufferPool>(0);
+      = new IdentityHashStore<>(0);
 
   public FSDataInputStream(InputStream in) {
     super(in);
@@ -279,4 +281,20 @@ public void readFully(long position, ByteBuffer buf) throws IOException {
   public IOStatistics getIOStatistics() {
     return IOStatisticsSupport.retrieveIOStatistics(in);
   }
+
+  @Override
+  public int minimumReasonableSeek() {
+    return ((PositionedReadable) in).minimumReasonableSeek();
+  }
+
+  @Override
+  public int maximumReadSize() {
+    return ((PositionedReadable) in).maximumReadSize();
+  }
+
+  @Override
+  public void readAsync(List<? extends FileRange> ranges,
+                        IntFunction<ByteBuffer> allocate) {
+    ((PositionedReadable) in).readAsync(ranges, allocate);
+  }
 }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileRange.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileRange.java
new file mode 100644
index 0000000000000..6490f1599b76f
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileRange.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.CompletableFuture;
+
+/**
+ * A byte range of a file.
+ * This is used for the asynchronous gather read API of
+ * {@link PositionedReadable#readAsync}.
+ */
+public interface FileRange {
+  /**
+   * Get the starting offset of the range
+   * @return the byte offset of the start
+   */
+  long getOffset();
+
+  /**
+   * Get the length of the range.
+   * @return the number of bytes in the range.
+   */
+  int getLength();
+
+  /**
+   * Get the future data for this range.
+   * @return the future for the {@link ByteBuffer} that contains the data
+   */
+  CompletableFuture<ByteBuffer> getData();
+
+  /**
+   * Set a future for this range's data.
+   * This method is called by {@link PositionedReadable#readAsync} to store the
+   * data for the user to pick up later via {@link #getData}.
+   * @param data the future of the ByteBuffer that will have the data
+   */
+  void setData(CompletableFuture<ByteBuffer> data);
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileRangeImpl.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileRangeImpl.java
new file mode 100644
index 0000000000000..a844dfc574442
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileRangeImpl.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.CompletableFuture;
+
+/**
+ * A range of bytes from a file with an optional buffer to read those bytes
+ * for zero copy.
+ */
+public class FileRangeImpl implements FileRange {
+  protected long offset;
+  protected int length;
+  private CompletableFuture<ByteBuffer> reader;
+
+  public FileRangeImpl(long offset, int length) {
+    this.offset = offset;
+    this.length = length;
+  }
+
+  @Override
+  public String toString() {
+    return "range[" + offset + "," + (offset + length) + ")";
+  }
+
+  @Override
+  public long getOffset() {
+    return offset;
+  }
+
+  @Override
+  public int getLength() {
+    return length;
+  }
+
+  @Override
+  public void setData(CompletableFuture<ByteBuffer> reader) {
+    this.reader = reader;
+  }
+
+  @Override
+  public CompletableFuture<ByteBuffer> getData() {
+    return reader;
+  }
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
index 6744d17a72666..2e5afad0c0792 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -18,9 +18,13 @@
 package org.apache.hadoop.fs;
 
 import java.io.*;
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.function.IntFunction;
 
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.impl.AsyncReaderUtils;
 
 /**
  * Stream that permits positional reading.
@@ -85,4 +89,37 @@ void readFully(long position, byte[] buffer, int offset, int length)
    * the read operation completed
    */
   void readFully(long position, byte[] buffer) throws IOException;
+
+  /**
+   * What is the smallest reasonable seek?
+   * @return the minimum number of bytes
+   */
+  default int minimumReasonableSeek() {
+    return 4 * 1024;
+  }
+
+  /**
+   * What is the largest size that we should group ranges together as?
+   * @return the number of bytes to read at once
+   */
+  default int maximumReadSize() {
+    return 1024 * 1024;
+  }
+
+  /**
+   * Read fully a list of file ranges asynchronously from this file.
+   * The default iterates through the ranges to read each synchronously, but
+   * the intent is that FSDataInputStream subclasses can make more efficient
+   * readers.
+   * As a result of the call, each range will have FileRange.setData(CompletableFuture)
+   * called with a future that when complete will have a ByteBuffer with the
+   * data from the file's range.
+   * @param ranges the byte ranges to read
+   * @param allocate the function to allocate ByteBuffer
+   */
+  default void readAsync(List<? extends FileRange> ranges,
+                         IntFunction<ByteBuffer> allocate) {
+    AsyncReaderUtils.readAsync(this, ranges, allocate,  minimumReasonableSeek(),
+        maximumReadSize());
+  }
 }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java
index edcc4a8b99e77..1b7538db6d580 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -33,8 +33,11 @@
 import java.io.FileDescriptor;
 import java.net.URI;
 import java.nio.ByteBuffer;
+import java.nio.channels.AsynchronousFileChannel;
+import java.nio.channels.CompletionHandler;
 import java.nio.file.Files;
 import java.nio.file.NoSuchFileException;
+import java.nio.file.StandardOpenOption;
 import java.nio.file.attribute.BasicFileAttributes;
 import java.nio.file.attribute.BasicFileAttributeView;
 import java.nio.file.attribute.FileTime;
@@ -44,6 +47,9 @@
 import java.util.Optional;
 import java.util.StringTokenizer;
 import java.util.concurrent.atomic.AtomicLong;
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.function.IntFunction;
 
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
@@ -125,7 +131,9 @@ public void initialize(URI uri, Configuration conf) throws IOException {
   class LocalFSFileInputStream extends FSInputStream implements
       HasFileDescriptor, IOStatisticsSource, StreamCapabilities {
     private FileInputStream fis;
+    private final File name;
     private long position;
+    private AsynchronousFileChannel asyncChannel = null;
 
     /**
      * Minimal set of counters.
@@ -143,7 +151,8 @@ class LocalFSFileInputStream extends FSInputStream implements
     private final AtomicLong bytesRead;
 
     public LocalFSFileInputStream(Path f) throws IOException {
-      fis = new FileInputStream(pathToFile(f));
+      name = pathToFile(f);
+      fis = new FileInputStream(name);
       bytesRead = ioStatistics.getCounterReference(
           STREAM_READ_BYTES);
     }
@@ -174,10 +183,16 @@ public boolean seekToNewSource(long targetPos) throws IOException {
     @Override
     public int available() throws IOException { return fis.available(); }
     @Override
-    public void close() throws IOException { fis.close(); }
-    @Override
     public boolean markSupported() { return false; }
-    
+
+    @Override
+    public void close() throws IOException {
+      fis.close();
+      if (asyncChannel != null) {
+        asyncChannel.close();
+      }
+    }
+
     @Override
     public int read() throws IOException {
       try {
@@ -267,8 +282,84 @@ public boolean hasCapability(String capability) {
     public IOStatistics getIOStatistics() {
       return ioStatistics;
     }
+
+    // QQ: Should we make this synchronized?
+    AsynchronousFileChannel getAsyncChannel() throws IOException {
+      if (asyncChannel == null) {
+        asyncChannel = AsynchronousFileChannel.open(name.toPath(),
+            StandardOpenOption.READ);
+      }
+      return asyncChannel;
+    }
+
+    @Override
+    public void readAsync(List<? extends FileRange> ranges,
+                          IntFunction<ByteBuffer> allocate) {
+      // Set up all of the futures, so that we can use them if things fail
+      for(FileRange range: ranges) {
+        range.setData(new CompletableFuture<>());
+      }
+      try {
+        AsynchronousFileChannel channel = getAsyncChannel();
+        ByteBuffer[] buffers = new ByteBuffer[ranges.size()];
+        AsyncHandler asyncHandler = new AsyncHandler(channel, ranges, buffers);
+        for(int i = 0; i < ranges.size(); ++i) {
+          FileRange range = ranges.get(i);
+          buffers[i] = allocate.apply(range.getLength());
+          channel.read(buffers[i], range.getOffset(), i, asyncHandler);
+        }
+      } catch (IOException ioe) {
+        LOG.info("Can't get async channel", ioe);
+        for(FileRange range: ranges) {
+          range.getData().completeExceptionally(ioe);
+        }
+      }
+    }
   }
-  
+
+  /**
+   * A CompletionHandler that implements readFully and translates back
+   * into the form of CompletionHandler that our users expect.
+   */
+  static class AsyncHandler implements CompletionHandler<Integer, Integer> {
+    private final AsynchronousFileChannel channel;
+    private final List<? extends FileRange> ranges;
+    private final ByteBuffer[] buffers;
+
+    AsyncHandler(AsynchronousFileChannel channel,
+                 List<? extends FileRange> ranges,
+                 ByteBuffer[] buffers) {
+      this.channel = channel;
+      this.ranges = ranges;
+      this.buffers = buffers;
+    }
+
+    @Override
+    public void completed(Integer result, Integer r) {
+      FileRange range = ranges.get(r);
+      ByteBuffer buffer = buffers[r];
+      if (result == -1) {
+        failed(new EOFException("Read past End of File"), r);
+      } else {
+        if (buffer.remaining() > 0) {
+          // issue a read for the rest of the buffer
+          // QQ: What if this fails? It has the same handler.
+          channel.read(buffer, range.getOffset() + buffer.position(), r, this);
+        } else {
+          // QQ: Why  is this required? I think because we don't want the
+          // user to read data beyond limit.
+          buffer.flip();
+          range.getData().complete(buffer);
+        }
+      }
+    }
+
+    @Override
+    public void failed(Throwable exc, Integer r) {
+      ranges.get(r).getData().completeExceptionally(exc);
+    }
+  }
+
   @Override
   public FSDataInputStream open(Path f, int bufferSize) throws IOException {
     getFileStatus(f);
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AsyncReaderUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AsyncReaderUtils.java
new file mode 100644
index 0000000000000..39ba2803ecca5
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AsyncReaderUtils.java
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.impl;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.function.IntFunction;
+
+import org.apache.hadoop.fs.ByteBufferPositionedReadable;
+import org.apache.hadoop.fs.FileRange;
+import org.apache.hadoop.fs.PositionedReadable;
+
+public class AsyncReaderUtils {
+  /**
+   * Read fully a list of file ranges asynchronously from this file.
+   * The default iterates through the ranges to read each synchronously, but
+   * the intent is that subclasses can make more efficient readers.
+   * The data or exceptions are pushed into {@link FileRange#getData()}.
+   * @param stream the stream to read the data from
+   * @param ranges the byte ranges to read
+   * @param allocate the byte buffer allocation
+   * @param minimumSeek the minimum number of bytes to seek over
+   * @param maximumRead the largest number of bytes to combine into a single read
+   */
+  public static void readAsync(PositionedReadable stream,
+                               List<? extends FileRange> ranges,
+                               IntFunction<ByteBuffer> allocate,
+                               int minimumSeek,
+                               int maximumRead) {
+    if (isOrderedDisjoint(ranges, 1, minimumSeek)) {
+      for(FileRange range: ranges) {
+        range.setData(readRangeFrom(stream, range, allocate));
+      }
+    } else {
+      for(CombinedFileRange range: sortAndMergeRanges(ranges, 1, minimumSeek,
+          maximumRead)) {
+        CompletableFuture<ByteBuffer> read =
+            readRangeFrom(stream, range, allocate);
+        for(FileRange child: range.getUnderlying()) {
+          child.setData(read.thenApply(
+              (b) -> sliceTo(b, range.getOffset(), child)));
+        }
+      }
+    }
+  }
+
+  /**
+   * Synchronously reads a range from the stream dealing with the combinations
+   * of ByteBuffers buffers and PositionedReadable streams.
+   * @param stream the stream to read from
+   * @param range the range to read
+   * @param allocate the function to allocate ByteBuffers
+   * @return the CompletableFuture that contains the read data
+   */
+  public static CompletableFuture<ByteBuffer> readRangeFrom(PositionedReadable stream,
+                                                            FileRange range,
+                                                            IntFunction<ByteBuffer> allocate) {
+    CompletableFuture<ByteBuffer> result = new CompletableFuture<>();
+    try {
+      ByteBuffer buffer = allocate.apply(range.getLength());
+      if (stream instanceof ByteBufferPositionedReadable) {
+        ((ByteBufferPositionedReadable) stream).readFully(range.getOffset(),
+            buffer);
+        buffer.flip();
+      } else {
+        if (buffer.isDirect()) {
+          // if we need to read data from a direct buffer and the stream doesn't
+          // support it, we allocate a byte array to use.
+          byte[] tmp = new byte[range.getLength()];
+          stream.readFully(range.getOffset(), tmp, 0, tmp.length);
+          buffer.put(tmp);
+          buffer.flip();
+        } else {
+          stream.readFully(range.getOffset(), buffer.array(),
+              buffer.arrayOffset(), range.getLength());
+        }
+      }
+      result.complete(buffer);
+    } catch (IOException ioe) {
+      result.completeExceptionally(ioe);
+    }
+    return result;
+  }
+
+  /**
+   * Is the given input list:
+   * <ul>
+   *   <li>already sorted by offset</li>
+   *   <li>each range is more than minimumSeek apart</li>
+   *   <li>the start and end of each range is a multiple of chunkSize</li>
+   * </ul>
+   *
+   * @param input the list of input ranges
+   * @param chunkSize the size of the chunks that the offset & end must align to
+   * @param minimumSeek the minimum distance between ranges
+   * @return true if we can use the input list as is
+   */
+  public static boolean isOrderedDisjoint(List<? extends FileRange> input,
+                                          int chunkSize,
+                                          int minimumSeek) {
+    long previous = -minimumSeek;
+    for(FileRange range: input) {
+      long offset = range.getOffset();
+      long end = range.getOffset() + range.getLength();
+      if (offset % chunkSize != 0 ||
+              end % chunkSize != 0 ||
+              (offset - previous < minimumSeek)) {
+        return false;
+      }
+      previous = end;
+    }
+    return true;
+  }
+
+  public static long roundDown(long offset, int chunkSize) {
+    if (chunkSize > 1) {
+      return offset - (offset % chunkSize);
+    } else {
+      return offset;
+    }
+  }
+
+  public static long roundUp(long offset, int chunkSize) {
+    if (chunkSize > 1) {
+      long next = offset + chunkSize - 1;
+      return next - (next % chunkSize);
+    } else {
+      return offset;
+    }
+  }
+
+  /**
+   * Sort and merge ranges to optimize the access from the underlying file
+   * system.
+   * The motivations are that:
+   * <ul>
+   *   <li>Upper layers want to pass down logical file ranges.</li>
+   *   <li>Fewer reads have better performance.</li>
+   *   <li>Applications want callbacks as ranges are read.</li>
+   *   <li>Some file systems want to round ranges to be at checksum boundaries.</li>
+   * </ul>
+   *
+   * @param input the list of input ranges
+   * @param chunkSize round the start and end points to multiples of chunkSize
+   * @param minimumSeek the smallest gap that we should seek over in bytes
+   * @param maxSize the largest combined file range in bytes
+   * @return the list of sorted CombinedFileRanges that cover the input
+   */
+  public static List<CombinedFileRange> sortAndMergeRanges(List<? extends FileRange> input,
+                                                           int chunkSize,
+                                                           int minimumSeek,
+                                                           int maxSize) {
+    // sort the ranges by offset
+    FileRange[] ranges = input.toArray(new FileRange[0]);
+    Arrays.sort(ranges, Comparator.comparingLong(FileRange::getOffset));
+    CombinedFileRange current = null;
+    List<CombinedFileRange> result = new ArrayList<>(ranges.length);
+
+    // now merge together the ones that merge
+    for(FileRange range: ranges) {
+      long start = roundDown(range.getOffset(), chunkSize);
+      long end = roundUp(range.getOffset() + range.getLength(), chunkSize);
+      if (current == null || !current.merge(start, end, range, minimumSeek, maxSize)) {
+        current = new CombinedFileRange(start, end, range);
+        result.add(current);
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Slice the data that was read to the user's request.
+   * This function assumes that the user's request is completely subsumed by the
+   * read data.
+   * @param readData the buffer with the readData
+   * @param readOffset the offset in the file for the readData
+   * @param request the user's request
+   * @return the readData buffer that is sliced to the user's request
+   */
+  public static ByteBuffer sliceTo(ByteBuffer readData, long readOffset,
+                                   FileRange request) {
+    int offsetChange = (int) (request.getOffset() - readOffset);
+    int requestLength = request.getLength();
+    // If we need to change the offset or length, make a copy and do it
+    if (offsetChange != 0 || readData.remaining() != requestLength) {
+      readData = readData.slice();
+      readData.position(offsetChange);
+      readData.limit(offsetChange + requestLength);
+    }
+    return readData;
+  }
+
+  private AsyncReaderUtils() {
+    throw new UnsupportedOperationException();
+  }
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/CombinedFileRange.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/CombinedFileRange.java
new file mode 100644
index 0000000000000..05d844a198577
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/CombinedFileRange.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.impl;
+
+import org.apache.hadoop.fs.FileRange;
+import org.apache.hadoop.fs.FileRangeImpl;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A file range that represents a set of underlying file ranges.
+ * This is used when we combine the user's FileRange objects
+ * together into a single read for efficiency.
+ */
+public class CombinedFileRange extends FileRangeImpl {
+  private ArrayList<FileRange> underlying = new ArrayList<>();
+
+  public CombinedFileRange(long offset, long end, FileRange original) {
+    super(offset, (int) (end - offset));
+    this.underlying.add(original);
+  }
+
+  /**
+   * Get the list of ranges that were merged together to form this one.
+   * @return the list of input ranges
+   */
+  public List<FileRange> getUnderlying() {
+    return underlying;
+  }
+
+  /**
+   * Merge this input range into the current one, if it is compatible.
+   * It is assumed that otherOffset is greater or equal the current offset,
+   * which typically happens by sorting the input ranges on offset.
+   * @param otherOffset the offset to consider merging
+   * @param otherEnd the end to consider merging
+   * @param other the underlying FileRange to add if we merge
+   * @param minSeek the minimum distance that we'll seek without merging the
+   *                ranges together
+   * @param maxSize the maximum size that we'll merge into a single range
+   * @return true if we have merged the range into this one
+   */
+  public boolean merge(long otherOffset, long otherEnd, FileRange other,
+                       int minSeek, int maxSize) {
+    long end = offset + length;
+    long newEnd = Math.max(end, otherEnd);
+    if (otherOffset - end >= minSeek || newEnd - offset > maxSize) {
+      return false;
+    }
+    length = (int) (newEnd - offset);
+    underlying.add(other);
+    return true;
+  }
+}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestAsyncReaderUtils.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestAsyncReaderUtils.java
new file mode 100644
index 0000000000000..6fd7398cf362c
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestAsyncReaderUtils.java
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.impl;
+
+import org.apache.hadoop.fs.ByteBufferPositionedReadable;
+import org.apache.hadoop.fs.FileRange;
+import org.apache.hadoop.fs.FileRangeImpl;
+import org.apache.hadoop.fs.PositionedReadable;
+import org.junit.Test;
+import org.mockito.ArgumentMatchers;
+import org.mockito.Mockito;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.IntBuffer;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.function.IntFunction;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test behavior of {@link AsyncReaderUtils}.
+ */
+public class TestAsyncReaderUtils  {
+
+  @Test
+  public void testSliceTo() {
+    final int SIZE = 64 * 1024;
+    ByteBuffer buffer = ByteBuffer.allocate(SIZE);
+    // fill the buffer with data
+    IntBuffer intBuffer = buffer.asIntBuffer();
+    for(int i=0; i < SIZE / Integer.BYTES; ++i) {
+      intBuffer.put(i);
+    }
+    // ensure we don't make unnecessary slices
+    ByteBuffer slice = AsyncReaderUtils.sliceTo(buffer, 100,
+        new FileRangeImpl(100, SIZE));
+    assertSame(buffer, slice);
+
+    // try slicing a range
+    final int OFFSET = 100;
+    final int SLICE_START = 1024;
+    final int SLICE_LENGTH = 16 * 1024;
+    slice = AsyncReaderUtils.sliceTo(buffer, OFFSET,
+        new FileRangeImpl(OFFSET + SLICE_START, SLICE_LENGTH));
+    // make sure they aren't the same, but use the same backing data
+    assertNotSame(buffer, slice);
+    assertSame(buffer.array(), slice.array());
+    // test the contents of the slice
+    intBuffer = slice.asIntBuffer();
+    for(int i=0; i < SLICE_LENGTH / Integer.BYTES; ++i) {
+      assertEquals("i = " + i, i + SLICE_START / Integer.BYTES, intBuffer.get());
+    }
+  }
+
+  @Test
+  public void testRounding() {
+    for(int i=5; i < 10; ++i) {
+      assertEquals("i = "+ i, 5, AsyncReaderUtils.roundDown(i, 5));
+      assertEquals("i = "+ i, 10, AsyncReaderUtils.roundUp(i+1, 5));
+    }
+    assertEquals(13, AsyncReaderUtils.roundDown(13, 1));
+    assertEquals(13, AsyncReaderUtils.roundUp(13, 1));
+  }
+
+  @Test
+  public void testMerge() {
+    FileRange base = new FileRangeImpl(2000, 1000);
+    CombinedFileRange mergeBase = new CombinedFileRange(2000, 3000, base);
+
+    // test when the gap between is too big
+    assertFalse(mergeBase.merge(5000, 6000,
+        new FileRangeImpl(5000, 1000), 2000, 4000));
+    assertEquals(1, mergeBase.getUnderlying().size());
+    assertEquals(2000, mergeBase.getOffset());
+    assertEquals(1000, mergeBase.getLength());
+
+    // test when the total size gets exceeded
+    assertFalse(mergeBase.merge(5000, 6000,
+        new FileRangeImpl(5000, 1000), 2001, 3999));
+    assertEquals(1, mergeBase.getUnderlying().size());
+    assertEquals(2000, mergeBase.getOffset());
+    assertEquals(1000, mergeBase.getLength());
+
+    // test when the merge works
+    assertTrue(mergeBase.merge(5000, 6000,
+        new FileRangeImpl(5000, 1000), 2001, 4000));
+    assertEquals(2, mergeBase.getUnderlying().size());
+    assertEquals(2000, mergeBase.getOffset());
+    assertEquals(4000, mergeBase.getLength());
+
+    // reset the mergeBase and test with a 10:1 reduction
+    mergeBase = new CombinedFileRange(200, 300, base);
+    assertEquals(200, mergeBase.getOffset());
+    assertEquals(100, mergeBase.getLength());
+    assertTrue(mergeBase.merge(500, 600,
+        new FileRangeImpl(5000, 1000), 201, 400));
+    assertEquals(2, mergeBase.getUnderlying().size());
+    assertEquals(200, mergeBase.getOffset());
+    assertEquals(400, mergeBase.getLength());
+  }
+
+  @Test
+  public void testSortAndMerge() {
+    List<FileRange> input = Arrays.asList(
+        new FileRangeImpl(3000, 100),
+        new FileRangeImpl(2100, 100),
+        new FileRangeImpl(1000, 100)
+        );
+    assertFalse(AsyncReaderUtils.isOrderedDisjoint(input, 100, 800));
+    List<CombinedFileRange> outputList = AsyncReaderUtils.sortAndMergeRanges(
+        input, 100, 1001, 2500);
+    assertEquals(1, outputList.size());
+    CombinedFileRange output = outputList.get(0);
+    assertEquals(3, output.getUnderlying().size());
+    assertEquals("range[1000,3100)", output.toString());
+    assertTrue(AsyncReaderUtils.isOrderedDisjoint(outputList, 100, 800));
+
+    // the minSeek doesn't allow the first two to merge
+    assertFalse(AsyncReaderUtils.isOrderedDisjoint(input, 100, 1000));
+    outputList = AsyncReaderUtils.sortAndMergeRanges(input, 100, 1000, 2100);
+    assertEquals(2, outputList.size());
+    assertEquals("range[1000,1100)", outputList.get(0).toString());
+    assertEquals("range[2100,3100)", outputList.get(1).toString());
+    assertTrue(AsyncReaderUtils.isOrderedDisjoint(outputList, 100, 1000));
+
+    // the maxSize doesn't allow the third range to merge
+    assertFalse(AsyncReaderUtils.isOrderedDisjoint(input, 100, 800));
+    outputList = AsyncReaderUtils.sortAndMergeRanges(input, 100, 1001, 2099);
+    assertEquals(2, outputList.size());
+    assertEquals("range[1000,2200)", outputList.get(0).toString());
+    assertEquals("range[3000,3100)", outputList.get(1).toString());
+    assertTrue(AsyncReaderUtils.isOrderedDisjoint(outputList, 100, 800));
+
+    // test the round up and round down (the maxSize doesn't allow any merges)
+    assertFalse(AsyncReaderUtils.isOrderedDisjoint(input, 16, 700));
+    outputList = AsyncReaderUtils.sortAndMergeRanges(input, 16, 1001, 100);
+    assertEquals(3, outputList.size());
+    assertEquals("range[992,1104)", outputList.get(0).toString());
+    assertEquals("range[2096,2208)", outputList.get(1).toString());
+    assertEquals("range[2992,3104)", outputList.get(2).toString());
+    assertTrue(AsyncReaderUtils.isOrderedDisjoint(outputList, 16, 700));
+  }
+
+  interface Stream extends PositionedReadable, ByteBufferPositionedReadable {
+    // nothing
+  }
+
+  static void fillBuffer(ByteBuffer buffer) {
+    byte b = 0;
+    while (buffer.remaining() > 0) {
+      buffer.put(b++);
+    }
+  }
+
+  @Test
+  public void testReadRangeFromByteBufferPositionedReadable() throws Exception {
+    Stream stream = Mockito.mock(Stream.class);
+    Mockito.doAnswer(invocation -> {
+      fillBuffer(invocation.getArgument(1));
+      return null;
+    }).when(stream).readFully(ArgumentMatchers.anyLong(),
+                              ArgumentMatchers.any(ByteBuffer.class));
+    CompletableFuture<ByteBuffer> result =
+        AsyncReaderUtils.readRangeFrom(stream, new FileRangeImpl(1000, 100),
+        ByteBuffer::allocate);
+    assertTrue(result.isDone());
+    ByteBuffer buffer = result.get();
+    assertEquals(100, buffer.remaining());
+    byte b = 0;
+    while (buffer.remaining() > 0) {
+      assertEquals("remain = " + buffer.remaining(), b++, buffer.get());
+    }
+
+    // test an IOException
+    Mockito.reset(stream);
+    Mockito.doThrow(new IOException("foo"))
+        .when(stream).readFully(ArgumentMatchers.anyLong(),
+                                ArgumentMatchers.any(ByteBuffer.class));
+    result =
+        AsyncReaderUtils.readRangeFrom(stream, new FileRangeImpl(1000, 100),
+            ByteBuffer::allocate);
+    assertTrue(result.isCompletedExceptionally());
+  }
+
+  static void runReadRangeFromPositionedReadable(IntFunction<ByteBuffer> allocate) throws Exception {
+    PositionedReadable stream = Mockito.mock(PositionedReadable.class);
+    Mockito.doAnswer(invocation -> {
+      byte b=0;
+      byte[] buffer = invocation.getArgument(1);
+      for(int i=0; i < buffer.length; ++i) {
+        buffer[i] = b++;
+      }
+      return null;
+    }).when(stream).readFully(ArgumentMatchers.anyLong(),
+        ArgumentMatchers.any(), ArgumentMatchers.anyInt(),
+        ArgumentMatchers.anyInt());
+    CompletableFuture<ByteBuffer> result =
+        AsyncReaderUtils.readRangeFrom(stream, new FileRangeImpl(1000, 100),
+            allocate);
+    assertTrue(result.isDone());
+    assertFalse(result.isCompletedExceptionally());
+    ByteBuffer buffer = result.get();
+    assertEquals(100, buffer.remaining());
+    byte b = 0;
+    while (buffer.remaining() > 0) {
+      assertEquals("remain = " + buffer.remaining(), b++, buffer.get());
+    }
+
+    // test an IOException
+    Mockito.reset(stream);
+    Mockito.doThrow(new IOException("foo"))
+        .when(stream).readFully(ArgumentMatchers.anyLong(),
+        ArgumentMatchers.any(), ArgumentMatchers.anyInt(),
+        ArgumentMatchers.anyInt());
+    result =
+        AsyncReaderUtils.readRangeFrom(stream, new FileRangeImpl(1000, 100),
+            ByteBuffer::allocate);
+    assertTrue(result.isCompletedExceptionally());
+  }
+
+  @Test
+  public void testReadRangeArray() throws Exception {
+    runReadRangeFromPositionedReadable(ByteBuffer::allocate);
+  }
+
+  @Test
+  public void testReadRangeDirect() throws Exception {
+    runReadRangeFromPositionedReadable(ByteBuffer::allocateDirect);
+  }
+
+  static void validateBuffer(String message, ByteBuffer buffer, int start) {
+    byte expected = (byte) start;
+    while (buffer.remaining() > 0) {
+      assertEquals(message + " remain: " + buffer.remaining(), expected++,
+          buffer.get());
+    }
+  }
+
+  @Test
+  public void testReadAsync() throws Exception {
+    List<FileRange> input = Arrays.asList(new FileRangeImpl(0, 100),
+        new FileRangeImpl(100_000, 100),
+        new FileRangeImpl(200_000, 100));
+    Stream stream = Mockito.mock(Stream.class);
+    Mockito.doAnswer(invocation -> {
+      fillBuffer(invocation.getArgument(1));
+      return null;
+    }).when(stream).readFully(ArgumentMatchers.anyLong(),
+        ArgumentMatchers.any(ByteBuffer.class));
+    // should not merge the ranges
+    AsyncReaderUtils.readAsync(stream, input, ByteBuffer::allocate, 100, 100);
+    Mockito.verify(stream, Mockito.times(3))
+        .readFully(ArgumentMatchers.anyLong(), ArgumentMatchers.any(ByteBuffer.class));
+    for(int b=0; b < input.size(); ++b) {
+      validateBuffer("buffer " + b, input.get(b).getData().get(), 0);
+    }
+  }
+
+  @Test
+  public void testReadAsyncMerge() throws Exception {
+    List<FileRange> input = Arrays.asList(new FileRangeImpl(2000, 100),
+        new FileRangeImpl(1000, 100),
+        new FileRangeImpl(0, 100));
+    Stream stream = Mockito.mock(Stream.class);
+    Mockito.doAnswer(invocation -> {
+      fillBuffer(invocation.getArgument(1));
+      return null;
+    }).when(stream).readFully(ArgumentMatchers.anyLong(),
+        ArgumentMatchers.any(ByteBuffer.class));
+    // should merge the ranges into a single read
+    AsyncReaderUtils.readAsync(stream, input, ByteBuffer::allocate, 1000, 2100);
+    Mockito.verify(stream, Mockito.times(1))
+        .readFully(ArgumentMatchers.anyLong(), ArgumentMatchers.any(ByteBuffer.class));
+    for(int b=0; b < input.size(); ++b) {
+      validateBuffer("buffer " + b, input.get(b).getData().get(), (2 - b) * 1000);
+    }
+  }
+}
diff --git a/hadoop-common-project/pom.xml b/hadoop-common-project/pom.xml
index b36dbf30610ff..58ef96a7a380c 100644
--- a/hadoop-common-project/pom.xml
+++ b/hadoop-common-project/pom.xml
@@ -57,4 +57,12 @@
     </plugins>
   </build>
 
+  <profiles>
+    <profile>
+      <id>benchmark</id>
+      <modules>
+        <module>benchmark</module>
+      </modules>
+    </profile>
+  </profiles>
 </project>
diff --git a/pom.xml b/pom.xml
index 5bdf9fd71868f..281c4f09fe0b2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -538,6 +538,7 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/x
             <exclude>licenses-binary/**</exclude>
             <exclude>dev-support/docker/pkg-resolver/packages.json</exclude>
             <exclude>dev-support/docker/pkg-resolver/platforms.json</exclude>
+            <exclude>**/target/**</exclude>
          </excludes>
        </configuration>
       </plugin>

From 06d11e75c3a7654307dddc68f78452a7198f3740 Mon Sep 17 00:00:00 2001
From: Mukund Thakur <mthakur@cloudera.com>
Date: Wed, 20 Oct 2021 17:31:08 +0530
Subject: [PATCH 02/13] async api to throw IOE and basic S3A implementation

---
 .../hadoop/fs/BufferedFSInputStream.java      |  2 +-
 .../apache/hadoop/fs/ChecksumFileSystem.java  |  2 +-
 .../apache/hadoop/fs/FSDataInputStream.java   |  2 +-
 .../apache/hadoop/fs/PositionedReadable.java  |  3 +-
 .../apache/hadoop/fs/s3a/S3AFileSystem.java   |  3 +-
 .../apache/hadoop/fs/s3a/S3AInputStream.java  | 63 ++++++++++++++++++-
 .../fs/s3a/TestS3AInputStreamRetry.java       |  3 +-
 7 files changed, 70 insertions(+), 8 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java
index e958a6bebf5e8..1ff683b622267 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java
@@ -178,7 +178,7 @@ public int maximumReadSize() {
 
   @Override
   public void readAsync(List<? extends FileRange> ranges,
-                        IntFunction<ByteBuffer> allocate) {
+                        IntFunction<ByteBuffer> allocate) throws IOException {
     ((PositionedReadable) in).readAsync(ranges, allocate);
   }
 }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
index 1cc640a4c9e7d..4ed53a7f9853a 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
@@ -379,7 +379,7 @@ static ByteBuffer checkBytes(ByteBuffer sumsBytes,
 
     @Override
     public void readAsync(List<? extends FileRange> ranges,
-                          IntFunction<ByteBuffer> allocate) {
+                          IntFunction<ByteBuffer> allocate) throws IOException {
       // If the stream doesn't have checksums, just delegate.
       if (sums == null) {
         datas.readAsync(ranges, allocate);
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java
index 8b704e385a639..4fd2866d759d3 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java
@@ -294,7 +294,7 @@ public int maximumReadSize() {
 
   @Override
   public void readAsync(List<? extends FileRange> ranges,
-                        IntFunction<ByteBuffer> allocate) {
+                        IntFunction<ByteBuffer> allocate) throws IOException {
     ((PositionedReadable) in).readAsync(ranges, allocate);
   }
 }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
index 2e5afad0c0792..4b48450f19339 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
@@ -116,9 +116,10 @@ default int maximumReadSize() {
    * data from the file's range.
    * @param ranges the byte ranges to read
    * @param allocate the function to allocate ByteBuffer
+   * @throws IOException any IOE.
    */
   default void readAsync(List<? extends FileRange> ranges,
-                         IntFunction<ByteBuffer> allocate) {
+                         IntFunction<ByteBuffer> allocate) throws IOException {
     AsyncReaderUtils.readAsync(this, ranges, allocate,  minimumReasonableSeek(),
         maximumReadSize());
   }
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
index 115abe302f61f..5fbdc4e03b69c 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
@@ -1499,7 +1499,8 @@ private FSDataInputStream open(
         new S3AInputStream(
             readContext,
             createObjectAttributes(fileStatus),
-            createInputStreamCallbacks(auditSpan)));
+            createInputStreamCallbacks(auditSpan),
+            boundedThreadPool));
   }
 
   /**
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
index 3a0b669543edf..8071c3b24d0ef 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
@@ -25,6 +25,7 @@
 import com.amazonaws.services.s3.model.S3ObjectInputStream;
 import org.apache.hadoop.classification.VisibleForTesting;
 import org.apache.hadoop.util.Preconditions;
+import org.apache.hadoop.fs.FileRange;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.CanSetReadahead;
@@ -46,6 +47,11 @@
 import java.io.EOFException;
 import java.io.IOException;
 import java.net.SocketTimeoutException;
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutorService;
+import java.util.function.IntFunction;
 
 import static org.apache.commons.lang3.StringUtils.isNotEmpty;
 import static org.apache.hadoop.util.StringUtils.toLowerCase;
@@ -100,6 +106,7 @@ public class S3AInputStream extends FSInputStream implements  CanSetReadahead,
   private S3ObjectInputStream wrappedStream;
   private final S3AReadOpContext context;
   private final InputStreamCallbacks client;
+  private final ExecutorService boundedThreadPool;
   private final String bucket;
   private final String key;
   private final String pathStr;
@@ -145,8 +152,9 @@ public class S3AInputStream extends FSInputStream implements  CanSetReadahead,
    * @param client S3 client to use
    */
   public S3AInputStream(S3AReadOpContext ctx,
-      S3ObjectAttributes s3Attributes,
-      InputStreamCallbacks client) {
+                        S3ObjectAttributes s3Attributes,
+                        InputStreamCallbacks client,
+                        ExecutorService threadPool) {
     Preconditions.checkArgument(isNotEmpty(s3Attributes.getBucket()),
         "No Bucket");
     Preconditions.checkArgument(isNotEmpty(s3Attributes.getKey()), "No Key");
@@ -168,6 +176,7 @@ public S3AInputStream(S3AReadOpContext ctx,
         s3Attributes);
     setInputPolicy(ctx.getInputPolicy());
     setReadahead(ctx.getReadahead());
+    this.boundedThreadPool = threadPool;
   }
 
   /**
@@ -793,6 +802,56 @@ public void readFully(long position, byte[] buffer, int offset, int length)
     }
   }
 
+  @Override
+  public void readAsync(List<? extends FileRange> ranges,
+                        IntFunction<ByteBuffer> allocate) throws IOException {
+    checkNotClosed();
+    for (FileRange range : ranges) {
+      validateRangeRequest(range);
+      CompletableFuture<ByteBuffer> result = new CompletableFuture<>();
+      range.setData(result);
+      boundedThreadPool.submit(() -> readSingleRange(range, allocate));
+    }
+  }
+
+  /**
+   * Should I move this for FSInputStream for subclasses to use?
+   * I think so. Pls comment.
+   * @param range
+   * @throws EOFException
+   */
+  private void validateRangeRequest(FileRange range) throws EOFException {
+
+    Preconditions.checkArgument(range.getLength() >= 0, "length is negative");
+    if (range.getOffset() < 0) {
+      throw new EOFException("position is negative");
+    }
+  }
+
+  /**
+   * Add retry in client.getObject().
+   * @param range
+   * @param allocate
+   */
+  private void readSingleRange(FileRange range, IntFunction<ByteBuffer> allocate) {
+    try {
+      long position = range.getOffset();
+      int length = range.getLength();
+      ByteBuffer buffer = allocate.apply(length);
+      final GetObjectRequest request = client.newGetRequest(key)
+              .withRange(position, position + range.getLength() - 1);
+      String text = String.format("%s %s at %d",
+              "readAsync", uri, position);
+      S3Object object = Invoker.once(text, uri,
+              () -> client.getObject(request));
+      object.getObjectContent().read(buffer.array(), buffer.arrayOffset(), range.getLength());
+      range.getData().complete(buffer);
+    } catch (IOException ex) {
+      range.getData().completeExceptionally(ex);
+    }
+  }
+
+
   /**
    * Access the input stream statistics.
    * This is for internal testing and may be removed without warning.
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AInputStreamRetry.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AInputStreamRetry.java
index db5b5b56851ea..95194f71661a7 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AInputStreamRetry.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AInputStreamRetry.java
@@ -109,7 +109,8 @@ private S3AInputStream getMockedS3AInputStream() {
     return new S3AInputStream(
         s3AReadOpContext,
         s3ObjectAttributes,
-        getMockedInputStreamCallback());
+        getMockedInputStreamCallback(),
+            null);
   }
 
   /**

From 7970faabee879cb1573c969f1f43d80a7f30a90d Mon Sep 17 00:00:00 2001
From: Mukund Thakur <mthakur@cloudera.com>
Date: Wed, 20 Oct 2021 17:45:45 +0530
Subject: [PATCH 03/13] Vectored Read API spec

---
 .../hadoop/fs/BufferedFSInputStream.java      | 14 ++++-----
 .../apache/hadoop/fs/ChecksumFileSystem.java  | 16 +++++-----
 .../apache/hadoop/fs/FSDataInputStream.java   | 14 ++++-----
 .../java/org/apache/hadoop/fs/FileRange.java  |  4 +--
 .../apache/hadoop/fs/PositionedReadable.java  | 12 +++----
 .../apache/hadoop/fs/RawLocalFileSystem.java  |  4 +--
 .../markdown/filesystem/fsdatainputstream.md  | 31 +++++++++++++++++++
 .../apache/hadoop/fs/s3a/S3AInputStream.java  |  4 +--
 8 files changed, 65 insertions(+), 34 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java
index 1ff683b622267..7f3171235c8f4 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java
@@ -167,18 +167,18 @@ public String toString() {
   }
 
   @Override
-  public int minimumReasonableSeek() {
-    return ((PositionedReadable) in).minimumReasonableSeek();
+  public int minSeekForVectorReads() {
+    return ((PositionedReadable) in).minSeekForVectorReads();
   }
 
   @Override
-  public int maximumReadSize() {
-    return ((PositionedReadable) in).maximumReadSize();
+  public int maxReadSizeForVectorReads() {
+    return ((PositionedReadable) in).maxReadSizeForVectorReads();
   }
 
   @Override
-  public void readAsync(List<? extends FileRange> ranges,
-                        IntFunction<ByteBuffer> allocate) throws IOException {
-    ((PositionedReadable) in).readAsync(ranges, allocate);
+  public void readVectored(List<? extends FileRange> ranges,
+                           IntFunction<ByteBuffer> allocate) throws IOException {
+    ((PositionedReadable) in).readVectored(ranges, allocate);
   }
 }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
index 4ed53a7f9853a..1badeda5113bf 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
@@ -378,22 +378,22 @@ static ByteBuffer checkBytes(ByteBuffer sumsBytes,
     }
 
     @Override
-    public void readAsync(List<? extends FileRange> ranges,
-                          IntFunction<ByteBuffer> allocate) throws IOException {
+    public void readVectored(List<? extends FileRange> ranges,
+                             IntFunction<ByteBuffer> allocate) throws IOException {
       // If the stream doesn't have checksums, just delegate.
       if (sums == null) {
-        datas.readAsync(ranges, allocate);
+        datas.readVectored(ranges, allocate);
         return;
       }
-      int minSeek = minimumReasonableSeek();
-      int maxSize = maximumReadSize();
+      int minSeek = minSeekForVectorReads();
+      int maxSize = maxReadSizeForVectorReads();
       List<CombinedFileRange> dataRanges =
           AsyncReaderUtils.sortAndMergeRanges(ranges, bytesPerSum,
-              minSeek, maximumReadSize());
+              minSeek, maxReadSizeForVectorReads());
       List<CombinedFileRange> checksumRanges = findChecksumRanges(dataRanges,
           bytesPerSum, minSeek, maxSize);
-      sums.readAsync(checksumRanges, allocate);
-      datas.readAsync(dataRanges, allocate);
+      sums.readVectored(checksumRanges, allocate);
+      datas.readVectored(dataRanges, allocate);
       for(CombinedFileRange checksumRange: checksumRanges) {
         for(FileRange dataRange: checksumRange.getUnderlying()) {
           // when we have both the ranges, validate the checksum
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java
index 4fd2866d759d3..52644402ca459 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java
@@ -283,18 +283,18 @@ public IOStatistics getIOStatistics() {
   }
 
   @Override
-  public int minimumReasonableSeek() {
-    return ((PositionedReadable) in).minimumReasonableSeek();
+  public int minSeekForVectorReads() {
+    return ((PositionedReadable) in).minSeekForVectorReads();
   }
 
   @Override
-  public int maximumReadSize() {
-    return ((PositionedReadable) in).maximumReadSize();
+  public int maxReadSizeForVectorReads() {
+    return ((PositionedReadable) in).maxReadSizeForVectorReads();
   }
 
   @Override
-  public void readAsync(List<? extends FileRange> ranges,
-                        IntFunction<ByteBuffer> allocate) throws IOException {
-    ((PositionedReadable) in).readAsync(ranges, allocate);
+  public void readVectored(List<? extends FileRange> ranges,
+                           IntFunction<ByteBuffer> allocate) throws IOException {
+    ((PositionedReadable) in).readVectored(ranges, allocate);
   }
 }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileRange.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileRange.java
index 6490f1599b76f..416f419ad7830 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileRange.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileRange.java
@@ -23,7 +23,7 @@
 /**
  * A byte range of a file.
  * This is used for the asynchronous gather read API of
- * {@link PositionedReadable#readAsync}.
+ * {@link PositionedReadable#readVectored}.
  */
 public interface FileRange {
   /**
@@ -46,7 +46,7 @@ public interface FileRange {
 
   /**
    * Set a future for this range's data.
-   * This method is called by {@link PositionedReadable#readAsync} to store the
+   * This method is called by {@link PositionedReadable#readVectored} to store the
    * data for the user to pick up later via {@link #getData}.
    * @param data the future of the ByteBuffer that will have the data
    */
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
index 4b48450f19339..11e644917a53e 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
@@ -94,7 +94,7 @@ void readFully(long position, byte[] buffer, int offset, int length)
    * What is the smallest reasonable seek?
    * @return the minimum number of bytes
    */
-  default int minimumReasonableSeek() {
+  default int minSeekForVectorReads() {
     return 4 * 1024;
   }
 
@@ -102,7 +102,7 @@ default int minimumReasonableSeek() {
    * What is the largest size that we should group ranges together as?
    * @return the number of bytes to read at once
    */
-  default int maximumReadSize() {
+  default int maxReadSizeForVectorReads() {
     return 1024 * 1024;
   }
 
@@ -118,9 +118,9 @@ default int maximumReadSize() {
    * @param allocate the function to allocate ByteBuffer
    * @throws IOException any IOE.
    */
-  default void readAsync(List<? extends FileRange> ranges,
-                         IntFunction<ByteBuffer> allocate) throws IOException {
-    AsyncReaderUtils.readAsync(this, ranges, allocate,  minimumReasonableSeek(),
-        maximumReadSize());
+  default void readVectored(List<? extends FileRange> ranges,
+                            IntFunction<ByteBuffer> allocate) throws IOException {
+    AsyncReaderUtils.readAsync(this, ranges, allocate,  minSeekForVectorReads(),
+        maxReadSizeForVectorReads());
   }
 }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java
index 1b7538db6d580..4c53294f6b0c5 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java
@@ -293,8 +293,8 @@ AsynchronousFileChannel getAsyncChannel() throws IOException {
     }
 
     @Override
-    public void readAsync(List<? extends FileRange> ranges,
-                          IntFunction<ByteBuffer> allocate) {
+    public void readVectored(List<? extends FileRange> ranges,
+                             IntFunction<ByteBuffer> allocate) {
       // Set up all of the futures, so that we can use them if things fail
       for(FileRange range: ranges) {
         range.setData(new CompletableFuture<>());
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md
index 090696483be34..42897ee901bc0 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md
@@ -443,6 +443,37 @@ The semantics of this are exactly equivalent to
 That is, the buffer is filled entirely with the contents of the input source
 from position `position`
 
+### `default void readAsync(List<? extends FileRange> ranges, IntFunction<ByteBuffer> allocate)`
+
+Read fully data for a list of ranges asynchronously. The default implementation
+iterates through the ranges, tries to coalesce the ranges based on values of
+`minSeekForVectorReads` and `maxReadSizeForVectorReads` and then read each merged 
+ranges synchronously, but the intent is sub classes can implement efficient
+implementation.
+
+#### Preconditions
+
+For each requested range:
+
+    range.getOffset >= 0 else raise IllegalArgumentException
+    range.getLength >= 0 else raise EOFException
+
+#### Postconditions
+
+For each requested range:
+
+    range.getData() returns CompletableFuture<ByteBuffer> which will have data
+    from range.getOffset to range.getLength. 
+
+### `minSeekForVectorReads()`
+
+Smallest reasonable seek. Two ranges won't be merged together if the difference between
+end of first and start of next range is more than this value.
+
+### `maxReadSizeForVectorReads()`
+
+Maximum number of bytes which can be read in one go after merging the ranges.
+Two ranges won't be merged if the combined data to be read is more than this value.
 
 ## Consistency
 
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
index 8071c3b24d0ef..d0015f7098f8a 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
@@ -803,8 +803,8 @@ public void readFully(long position, byte[] buffer, int offset, int length)
   }
 
   @Override
-  public void readAsync(List<? extends FileRange> ranges,
-                        IntFunction<ByteBuffer> allocate) throws IOException {
+  public void readVectored(List<? extends FileRange> ranges,
+                           IntFunction<ByteBuffer> allocate) throws IOException {
     checkNotClosed();
     for (FileRange range : ranges) {
       validateRangeRequest(range);

From fd20570cb212db5ff8aea9c50370cd7c01bc7ac3 Mon Sep 17 00:00:00 2001
From: Mukund Thakur <mthakur@cloudera.com>
Date: Fri, 12 Nov 2021 12:50:16 +0530
Subject: [PATCH 04/13] Adding contract tests for vectored read API

---
 .../hadoop/benchmark/AsyncBenchmark.java      |   2 +-
 .../apache/hadoop/fs/ChecksumFileSystem.java  |   7 +
 .../apache/hadoop/fs/RawLocalFileSystem.java  |   8 +-
 .../hadoop/fs/impl/AsyncReaderUtils.java      |  25 +++
 .../AbstractContractVectoredReadTest.java     | 193 ++++++++++++++++++
 .../TestLocalFSConractVectoredRead.java       |  17 ++
 .../TestRawLocalContractVectoredRead.java     |  17 ++
 .../apache/hadoop/fs/s3a/S3AInputStream.java  |  22 +-
 .../s3a/ITestS3AContractVectoredRead.java     |  36 ++++
 9 files changed, 317 insertions(+), 10 deletions(-)
 create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
 create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSConractVectoredRead.java
 create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/rawlocal/TestRawLocalContractVectoredRead.java
 create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractVectoredRead.java

diff --git a/hadoop-common-project/benchmark/src/main/java/org/apache/hadoop/benchmark/AsyncBenchmark.java b/hadoop-common-project/benchmark/src/main/java/org/apache/hadoop/benchmark/AsyncBenchmark.java
index a576acc18f7a9..9c6f56ff65e7f 100644
--- a/hadoop-common-project/benchmark/src/main/java/org/apache/hadoop/benchmark/AsyncBenchmark.java
+++ b/hadoop-common-project/benchmark/src/main/java/org/apache/hadoop/benchmark/AsyncBenchmark.java
@@ -108,7 +108,7 @@ public void asyncRead(FileSystemChoice fsChoice,
       FileRangeImpl range = new FileRangeImpl(m * 1024L * 1024, 64 * 1024);
       ranges.add(range);
     }
-    stream.readAsync(ranges, bufferChoice.allocate);
+    stream.readVectored(ranges, bufferChoice.allocate);
     for(FileRange range: ranges) {
       blackhole.consume(range.getData().get());
     }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
index 1badeda5113bf..aefaa2b751df7 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
@@ -32,6 +32,7 @@
 import java.util.List;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.CompletionException;
+import java.util.concurrent.ExecutionException;
 import java.util.function.IntFunction;
 import java.util.zip.CRC32;
 
@@ -381,6 +382,7 @@ static ByteBuffer checkBytes(ByteBuffer sumsBytes,
     public void readVectored(List<? extends FileRange> ranges,
                              IntFunction<ByteBuffer> allocate) throws IOException {
       // If the stream doesn't have checksums, just delegate.
+      AsyncReaderUtils.validateVectoredReadRanges(ranges);
       if (sums == null) {
         datas.readVectored(ranges, allocate);
         return;
@@ -394,6 +396,11 @@ public void readVectored(List<? extends FileRange> ranges,
           bytesPerSum, minSeek, maxSize);
       sums.readVectored(checksumRanges, allocate);
       datas.readVectored(dataRanges, allocate);
+      // Data read is correct. I have verified content of dataRanges.
+      // There is some bug below here as test (testVectoredReadMultipleRanges)
+      // is failing, should be
+      // somewhere while slicing the merged data into smaller user ranges.
+      // Spend some time figuring out but it is a complex code.
       for(CombinedFileRange checksumRange: checksumRanges) {
         for(FileRange dataRange: checksumRange.getUnderlying()) {
           // when we have both the ranges, validate the checksum
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java
index 4c53294f6b0c5..7300ed69e3456 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java
@@ -20,6 +20,7 @@
 package org.apache.hadoop.fs;
 
 import org.apache.hadoop.classification.VisibleForTesting;
+import org.apache.hadoop.fs.impl.AsyncReaderUtils;
 
 import java.io.BufferedOutputStream;
 import java.io.DataOutput;
@@ -294,9 +295,11 @@ AsynchronousFileChannel getAsyncChannel() throws IOException {
 
     @Override
     public void readVectored(List<? extends FileRange> ranges,
-                             IntFunction<ByteBuffer> allocate) {
+                             IntFunction<ByteBuffer> allocate) throws IOException {
+
       // Set up all of the futures, so that we can use them if things fail
       for(FileRange range: ranges) {
+        AsyncReaderUtils.validateRangeRequest(range);
         range.setData(new CompletableFuture<>());
       }
       try {
@@ -309,7 +312,7 @@ public void readVectored(List<? extends FileRange> ranges,
           channel.read(buffers[i], range.getOffset(), i, asyncHandler);
         }
       } catch (IOException ioe) {
-        LOG.info("Can't get async channel", ioe);
+        LOG.error("Exception occurred during vectored read ", ioe);
         for(FileRange range: ranges) {
           range.getData().completeExceptionally(ioe);
         }
@@ -356,6 +359,7 @@ public void completed(Integer result, Integer r) {
 
     @Override
     public void failed(Throwable exc, Integer r) {
+      LOG.error("Failed while reading range {} ", r, exc);
       ranges.get(r).getData().completeExceptionally(exc);
     }
   }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AsyncReaderUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AsyncReaderUtils.java
index 39ba2803ecca5..132f6e78b7645 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AsyncReaderUtils.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AsyncReaderUtils.java
@@ -18,6 +18,7 @@
 
 package org.apache.hadoop.fs.impl;
 
+import java.io.EOFException;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
@@ -30,8 +31,32 @@
 import org.apache.hadoop.fs.ByteBufferPositionedReadable;
 import org.apache.hadoop.fs.FileRange;
 import org.apache.hadoop.fs.PositionedReadable;
+import org.apache.hadoop.util.Preconditions;
 
 public class AsyncReaderUtils {
+
+  /**
+   * @param range
+   * @throws EOFException
+   */
+  public static void validateRangeRequest(FileRange range)
+          throws EOFException {
+
+    Preconditions.checkArgument(range.getLength() >= 0, "length is negative");
+    if (range.getOffset() < 0) {
+      throw new EOFException("position is negative");
+    }
+  }
+
+  public static void validateVectoredReadRanges(List<? extends FileRange> ranges)
+          throws EOFException {
+    for (FileRange range : ranges) {
+      validateRangeRequest(range);
+    }
+  }
+
+
+
   /**
    * Read fully a list of file ranges asynchronously from this file.
    * The default iterates through the ranges to read each synchronously, but
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
new file mode 100644
index 0000000000000..df4ad2c061836
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
@@ -0,0 +1,193 @@
+package org.apache.hadoop.fs.contract;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileRange;
+import org.apache.hadoop.fs.FileRangeImpl;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.impl.FutureIOSupport;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.function.IntFunction;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.createFile;
+
+@RunWith(Parameterized.class)
+public abstract class AbstractContractVectoredReadTest extends AbstractFSContractTestBase {
+
+  private static final Logger LOG = LoggerFactory.getLogger(AbstractContractVectoredReadTest.class);
+
+  public static final int DATASET_LEN = 1024;
+  private static final byte[] DATASET = ContractTestUtils.dataset(DATASET_LEN, 'a', 32);
+  private static final String VECTORED_READ_FILE_NAME = "vectored_file.txt";
+
+  private IntFunction<ByteBuffer> allocate;
+
+  private String bufferType;
+
+  @Parameterized.Parameters
+  public static List<String> params() {
+    return Arrays.asList("direct", "array");
+  }
+
+  public AbstractContractVectoredReadTest(String bufferType) {
+    this.bufferType = bufferType;
+    this.allocate = "array".equals(bufferType) ?
+            ByteBuffer::allocate : ByteBuffer::allocateDirect;
+  }
+
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    Path path = path(VECTORED_READ_FILE_NAME);
+    FileSystem fs = getFileSystem();
+    createFile(fs, path, true, DATASET);
+  }
+
+  @Test
+  public void testVectoredReadMultipleRanges() throws Exception {
+    describe("Running with buffer type : " + bufferType);
+    FileSystem fs = getFileSystem();
+    List<FileRange> fileRanges = new ArrayList<>();
+    for (int i = 0; i < 10; i++) {
+      FileRange fileRange = new FileRangeImpl(i * 100, 100);
+      fileRanges.add(fileRange);
+    }
+    try (FSDataInputStream in = fs.open(path(VECTORED_READ_FILE_NAME))) {
+      in.readVectored(fileRanges, allocate);
+      CompletableFuture<?>[] completableFutures = new CompletableFuture<?>[fileRanges.size()];
+      int i = 0;
+      for (FileRange res : fileRanges) {
+        completableFutures[i++] = res.getData();
+      }
+      CompletableFuture<Void> combinedFuture = CompletableFuture.allOf(completableFutures);
+      combinedFuture.get();
+
+      for (FileRange res : fileRanges) {
+        CompletableFuture<ByteBuffer> data = res.getData();
+        try {
+          ByteBuffer buffer = FutureIOSupport.awaitFuture(data);
+          LOG.info("Returned data from offset {} : {} ", res.getOffset(),
+                  Arrays.toString(buffer.array()));
+          //assertDatasetEquals((int) res.getOffset(), "readAsync", buffer, res.getLength());
+        } catch (Exception ex) {
+          LOG.error("Exception while running vectored read ", ex);
+          //Assert.fail("Exception while running vectored read " + ex);
+        }
+      }
+    }
+  }
+
+  @Test
+  public void testOverlappingRanges() throws Exception {
+    FileSystem fs = getFileSystem();
+    List<FileRange> fileRanges = new ArrayList<>();
+    fileRanges.add(new FileRangeImpl(0, 100));
+    fileRanges.add(new FileRangeImpl(90, 50));
+    try (FSDataInputStream in = fs.open(path(VECTORED_READ_FILE_NAME))) {
+      in.readVectored(fileRanges, allocate);
+      for (FileRange res : fileRanges) {
+        CompletableFuture<ByteBuffer> data = res.getData();
+        try {
+          ByteBuffer buffer = data.get();
+          assertDatasetEquals((int) res.getOffset(), "vecRead", buffer, res.getLength());
+        } catch (Exception ex) {
+          LOG.error("Exception while running vectored read ", ex);
+          Assert.fail("Exception while running vectored read " + ex);
+        }
+      }
+    }
+  }
+
+  @Test
+  public void testEOFRanges()  throws Exception {
+    FileSystem fs = getFileSystem();
+    List<FileRange> fileRanges = new ArrayList<>();
+    fileRanges.add(new FileRangeImpl(DATASET_LEN, 100));
+    try (FSDataInputStream in = fs.open(path(VECTORED_READ_FILE_NAME))) {
+      in.readVectored(fileRanges, allocate);
+      for (FileRange res : fileRanges) {
+        CompletableFuture<ByteBuffer> data = res.getData();
+        try {
+          ByteBuffer buffer = data.get();
+          // Shouldn't reach here.
+          Assert.fail("EOFException must be thrown while reading EOF");
+        } catch (ExecutionException ex) {
+          // ignore as expected.
+        } catch (Exception ex) {
+          LOG.error("Exception while running vectored read ", ex);
+          Assert.fail("Exception while running vectored read " + ex);
+        }
+      }
+    }
+  }
+
+  @Test
+  public void testNegativeLengthRange()  throws Exception {
+    FileSystem fs = getFileSystem();
+    List<FileRange> fileRanges = new ArrayList<>();
+    fileRanges.add(new FileRangeImpl(0, -50));
+    testExceptionalVectoredRead(fs, fileRanges, "Exception is expected");
+  }
+
+  @Test
+  public void testNegativeOffsetRange()  throws Exception {
+    FileSystem fs = getFileSystem();
+    List<FileRange> fileRanges = new ArrayList<>();
+    fileRanges.add(new FileRangeImpl(-1, 50));
+    testExceptionalVectoredRead(fs, fileRanges, "Exception is expected");
+  }
+
+  protected void testExceptionalVectoredRead(FileSystem fs,
+                                             List<FileRange> fileRanges,
+                                             String s) throws IOException {
+    boolean exRaised = false;
+    try (FSDataInputStream in = fs.open(path(VECTORED_READ_FILE_NAME))) {
+      // Can we intercept here as done in S3 tests ??
+      in.readVectored(fileRanges, ByteBuffer::allocate);
+    } catch (EOFException | IllegalArgumentException ex) {
+      // expected.
+      exRaised = true;
+    }
+    Assertions.assertThat(exRaised)
+            .describedAs(s)
+            .isTrue();
+  }
+
+  /**
+   * Assert that the data read matches the dataset at the given offset.
+   * This helps verify that the seek process is moving the read pointer
+   * to the correct location in the file.
+   *
+   * @param readOffset the offset in the file where the read began.
+   * @param operation  operation name for the assertion.
+   * @param data       data read in.
+   * @param length     length of data to check.
+   */
+  private void assertDatasetEquals(
+          final int readOffset, final String operation,
+          final ByteBuffer data,
+          int length) {
+    for (int i = 0; i < length; i++) {
+      int o = readOffset + i;
+      assertEquals(operation + " with read offset " + readOffset
+                      + ": data[" + i + "] != DATASET[" + o + "]",
+              DATASET[o], data.get(i));
+    }
+  }
+}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSConractVectoredRead.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSConractVectoredRead.java
new file mode 100644
index 0000000000000..264b6e202f8ce
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSConractVectoredRead.java
@@ -0,0 +1,17 @@
+package org.apache.hadoop.fs.contract.localfs;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.contract.AbstractContractVectoredReadTest;
+import org.apache.hadoop.fs.contract.AbstractFSContract;
+
+public class TestLocalFSConractVectoredRead extends AbstractContractVectoredReadTest {
+
+  public TestLocalFSConractVectoredRead(String bufferType) {
+    super(bufferType);
+  }
+
+  @Override
+  protected AbstractFSContract createContract(Configuration conf) {
+    return new LocalFSContract(conf);
+  }
+}
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/rawlocal/TestRawLocalContractVectoredRead.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/rawlocal/TestRawLocalContractVectoredRead.java
new file mode 100644
index 0000000000000..b5bd8791c89ed
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/rawlocal/TestRawLocalContractVectoredRead.java
@@ -0,0 +1,17 @@
+package org.apache.hadoop.fs.contract.rawlocal;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.contract.AbstractContractVectoredReadTest;
+import org.apache.hadoop.fs.contract.AbstractFSContract;
+
+public class TestRawLocalContractVectoredRead extends AbstractContractVectoredReadTest {
+
+  public TestRawLocalContractVectoredRead(String bufferType) {
+    super(bufferType);
+  }
+
+  @Override
+  protected AbstractFSContract createContract(Configuration conf) {
+    return new RawlocalFSContract(conf);
+  }
+}
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
index d0015f7098f8a..253e31012df13 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
@@ -26,6 +26,7 @@
 import org.apache.hadoop.classification.VisibleForTesting;
 import org.apache.hadoop.util.Preconditions;
 import org.apache.hadoop.fs.FileRange;
+import org.apache.hadoop.fs.impl.AsyncReaderUtils;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.CanSetReadahead;
@@ -815,16 +816,14 @@ public void readVectored(List<? extends FileRange> ranges,
   }
 
   /**
-   * Should I move this for FSInputStream for subclasses to use?
-   * I think so. Pls comment.
+   * Validates range parameters.
    * @param range
    * @throws EOFException
    */
   private void validateRangeRequest(FileRange range) throws EOFException {
-
-    Preconditions.checkArgument(range.getLength() >= 0, "length is negative");
-    if (range.getOffset() < 0) {
-      throw new EOFException("position is negative");
+    AsyncReaderUtils.validateRangeRequest(range);
+    if(range.getOffset() + range.getLength() > contentLength) {
+      throw new EOFException("Requested range is beyond EOF");
     }
   }
 
@@ -844,9 +843,18 @@ private void readSingleRange(FileRange range, IntFunction<ByteBuffer> allocate)
               "readAsync", uri, position);
       S3Object object = Invoker.once(text, uri,
               () -> client.getObject(request));
-      object.getObjectContent().read(buffer.array(), buffer.arrayOffset(), range.getLength());
+      S3ObjectInputStream objectContent = object.getObjectContent();
+      if (buffer.isDirect()) {
+        byte[] tmp = new byte[length];
+        objectContent.read(tmp, 0, length);
+        buffer.put(tmp);
+        buffer.flip();
+      } else {
+        objectContent.read(buffer.array(), buffer.arrayOffset(), range.getLength());
+      }
       range.getData().complete(buffer);
     } catch (IOException ex) {
+      LOG.error("Exception while reading a range {} ", range, ex);
       range.getData().completeExceptionally(ex);
     }
   }
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractVectoredRead.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractVectoredRead.java
new file mode 100644
index 0000000000000..41148100b6203
--- /dev/null
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractVectoredRead.java
@@ -0,0 +1,36 @@
+package org.apache.hadoop.fs.contract.s3a;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileRange;
+import org.apache.hadoop.fs.FileRangeImpl;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.contract.AbstractContractVectoredReadTest;
+import org.apache.hadoop.fs.contract.AbstractFSContract;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class ITestS3AContractVectoredRead extends AbstractContractVectoredReadTest {
+
+  public ITestS3AContractVectoredRead(String bufferType) {
+    super(bufferType);
+  }
+
+  @Override
+  protected AbstractFSContract createContract(Configuration conf) {
+    return new S3AContract(conf);
+  }
+
+  /**
+   * Overriding in S3 vectored read api fails fast in case of EOF
+   * requested range.
+   * @throws Exception
+   */
+  @Override
+  public void testEOFRanges() throws Exception {
+    FileSystem fs = getFileSystem();
+    List<FileRange> fileRanges = new ArrayList<>();
+    fileRanges.add(new FileRangeImpl(DATASET_LEN, 100));
+    testExceptionalVectoredRead(fs, fileRanges, "EOFException is expected");
+  }
+}

From 65edb35bfcc35f9a0d2cdbfe41d682da79455aa4 Mon Sep 17 00:00:00 2001
From: Mukund Thakur <mthakur@cloudera.com>
Date: Fri, 12 Nov 2021 16:25:20 +0530
Subject: [PATCH 05/13] Move benchmark test to hadoop-tools module

---
 hadoop-project/pom.xml                        | 11 ++++++++++
 .../hadoop-benchmark}/pom.xml                 | 12 ++---------
 .../src/main/assembly/uber.xml                |  0
 .../src/main/findbugs/exclude.xml             |  0
 .../benchmark/VectoredReadBenchmark.java      | 21 +++++++++----------
 hadoop-tools/pom.xml                          |  1 +
 6 files changed, 24 insertions(+), 21 deletions(-)
 rename {hadoop-common-project/benchmark => hadoop-tools/hadoop-benchmark}/pom.xml (85%)
 rename {hadoop-common-project/benchmark => hadoop-tools/hadoop-benchmark}/src/main/assembly/uber.xml (100%)
 rename {hadoop-common-project/benchmark => hadoop-tools/hadoop-benchmark}/src/main/findbugs/exclude.xml (100%)
 rename hadoop-common-project/benchmark/src/main/java/org/apache/hadoop/benchmark/AsyncBenchmark.java => hadoop-tools/hadoop-benchmark/src/main/java/org/apache/hadoop/benchmark/VectoredReadBenchmark.java (98%)

diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml
index ca6886641fdb2..725899ef69ef3 100644
--- a/hadoop-project/pom.xml
+++ b/hadoop-project/pom.xml
@@ -212,6 +212,7 @@
     <json-smart.version>2.4.7</json-smart.version>
     <nimbus-jose-jwt.version>9.8.1</nimbus-jose-jwt.version>
     <apache-ant.version>1.10.11</apache-ant.version>
+    <jmh.version>1.20</jmh.version>
   </properties>
 
   <dependencyManagement>
@@ -1545,6 +1546,16 @@
          </exclusion>
        </exclusions>
      </dependency>
+      <dependency>
+        <groupId>org.openjdk.jmh</groupId>
+        <artifactId>jmh-core</artifactId>
+        <version>${jmh.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.openjdk.jmh</groupId>
+        <artifactId>jmh-generator-annprocess</artifactId>
+        <version>${jmh.version}</version>
+      </dependency>
      <dependency>
        <groupId>org.apache.curator</groupId>
        <artifactId>curator-test</artifactId>
diff --git a/hadoop-common-project/benchmark/pom.xml b/hadoop-tools/hadoop-benchmark/pom.xml
similarity index 85%
rename from hadoop-common-project/benchmark/pom.xml
rename to hadoop-tools/hadoop-benchmark/pom.xml
index 19999781503fc..5682ff96da743 100644
--- a/hadoop-common-project/benchmark/pom.xml
+++ b/hadoop-tools/hadoop-benchmark/pom.xml
@@ -23,7 +23,7 @@
     <groupId>org.apache.hadoop</groupId>
     <artifactId>hadoop-project</artifactId>
     <version>3.4.0-SNAPSHOT</version>
-    <relativePath>../../hadoop-project</relativePath>
+    <relativePath>../../hadoop-project/pom.xml</relativePath>
   </parent>
   <artifactId>hadoop-benchmark</artifactId>
   <version>3.4.0-SNAPSHOT</version>
@@ -32,12 +32,6 @@
   <name>Apache Hadoop Common Benchmark</name>
   <description>Apache Hadoop Common Benchmark</description>
 
-  <properties>
-    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-    <maven.compiler.useIncrementalCompilation>false</maven.compiler.useIncrementalCompilation>
-    <jmh.version>1.20</jmh.version>
-  </properties>
-
   <dependencies>
     <dependency>
       <groupId>org.apache.hadoop</groupId>
@@ -46,12 +40,10 @@
     <dependency>
       <groupId>org.openjdk.jmh</groupId>
       <artifactId>jmh-core</artifactId>
-      <version>${jmh.version}</version>
     </dependency>
     <dependency>
       <groupId>org.openjdk.jmh</groupId>
       <artifactId>jmh-generator-annprocess</artifactId>
-      <version>${jmh.version}</version>
     </dependency>
   </dependencies>
 
@@ -62,7 +54,7 @@
         <configuration>
           <archive>
             <manifest>
-              <mainClass>org.apache.hadoop.benchmark.AsyncBenchmark</mainClass>
+              <mainClass>org.apache.hadoop.benchmark.VectoredReadBenchmark</mainClass>
             </manifest>
           </archive>
           <descriptors>
diff --git a/hadoop-common-project/benchmark/src/main/assembly/uber.xml b/hadoop-tools/hadoop-benchmark/src/main/assembly/uber.xml
similarity index 100%
rename from hadoop-common-project/benchmark/src/main/assembly/uber.xml
rename to hadoop-tools/hadoop-benchmark/src/main/assembly/uber.xml
diff --git a/hadoop-common-project/benchmark/src/main/findbugs/exclude.xml b/hadoop-tools/hadoop-benchmark/src/main/findbugs/exclude.xml
similarity index 100%
rename from hadoop-common-project/benchmark/src/main/findbugs/exclude.xml
rename to hadoop-tools/hadoop-benchmark/src/main/findbugs/exclude.xml
diff --git a/hadoop-common-project/benchmark/src/main/java/org/apache/hadoop/benchmark/AsyncBenchmark.java b/hadoop-tools/hadoop-benchmark/src/main/java/org/apache/hadoop/benchmark/VectoredReadBenchmark.java
similarity index 98%
rename from hadoop-common-project/benchmark/src/main/java/org/apache/hadoop/benchmark/AsyncBenchmark.java
rename to hadoop-tools/hadoop-benchmark/src/main/java/org/apache/hadoop/benchmark/VectoredReadBenchmark.java
index 9c6f56ff65e7f..a5e99dddfae20 100644
--- a/hadoop-common-project/benchmark/src/main/java/org/apache/hadoop/benchmark/AsyncBenchmark.java
+++ b/hadoop-tools/hadoop-benchmark/src/main/java/org/apache/hadoop/benchmark/VectoredReadBenchmark.java
@@ -18,14 +18,6 @@
 
 package org.apache.hadoop.benchmark;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileRange;
-import org.apache.hadoop.fs.FileRangeImpl;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocalFileSystem;
-import org.apache.hadoop.fs.Path;
-
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
 import org.openjdk.jmh.annotations.Level;
@@ -41,7 +33,6 @@
 
 import java.io.EOFException;
 import java.io.IOException;
-
 import java.nio.ByteBuffer;
 import java.nio.channels.AsynchronousFileChannel;
 import java.nio.channels.CompletionHandler;
@@ -52,9 +43,17 @@
 import java.util.concurrent.TimeUnit;
 import java.util.function.IntFunction;
 
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileRange;
+import org.apache.hadoop.fs.FileRangeImpl;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
 @BenchmarkMode(Mode.AverageTime)
 @OutputTimeUnit(TimeUnit.MICROSECONDS)
-public class AsyncBenchmark {
+public class VectoredReadBenchmark {
 
   static final Path DATA_PATH = getTestDataPath();
   static final String DATA_PATH_PROPERTY = "bench.data";
@@ -233,7 +232,7 @@ public void syncRead(FileSystemChoice fsChoice,
    */
   public static void main(String[] args) throws Exception {
     OptionsBuilder opts = new OptionsBuilder();
-    opts.include("AsyncBenchmark");
+    opts.include("VectoredReadBenchmark");
     opts.jvmArgs("-server", "-Xms256m", "-Xmx2g",
         "-D" + DATA_PATH_PROPERTY + "=" + args[0]);
     opts.forks(1);
diff --git a/hadoop-tools/pom.xml b/hadoop-tools/pom.xml
index f026bc261e00b..4e934cd101f85 100644
--- a/hadoop-tools/pom.xml
+++ b/hadoop-tools/pom.xml
@@ -51,6 +51,7 @@
     <module>hadoop-azure-datalake</module>
     <module>hadoop-aliyun</module>
     <module>hadoop-fs2img</module>
+    <module>hadoop-benchmark</module>
   </modules>
 
   <build>

From 8b8dff8e7fd592e9d4efb59ced42f0021579b774 Mon Sep 17 00:00:00 2001
From: Mukund Thakur <mthakur@cloudera.com>
Date: Tue, 16 Nov 2021 16:42:07 +0530
Subject: [PATCH 06/13] Review comments

---
 hadoop-common-project/hadoop-common/pom.xml   |  8 --
 .../apache/hadoop/fs/ChecksumFileSystem.java  |  9 +-
 .../apache/hadoop/fs/PositionedReadable.java  |  7 +-
 .../apache/hadoop/fs/RawLocalFileSystem.java  | 15 ++--
 ...eaderUtils.java => VectoredReadUtils.java} | 47 +++++++----
 ...rUtils.java => TestVectoredReadUtils.java} | 82 +++++++++++--------
 .../org/apache/hadoop/test/MoreAsserts.java   | 37 +++++++--
 hadoop-common-project/pom.xml                 |  9 --
 .../apache/hadoop/fs/s3a/S3AInputStream.java  |  4 +-
 .../benchmark/VectoredReadBenchmark.java      | 16 ++--
 10 files changed, 134 insertions(+), 100 deletions(-)
 rename hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/{AsyncReaderUtils.java => VectoredReadUtils.java} (86%)
 rename hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/{TestAsyncReaderUtils.java => TestVectoredReadUtils.java} (77%)

diff --git a/hadoop-common-project/hadoop-common/pom.xml b/hadoop-common-project/hadoop-common/pom.xml
index 71fb2ce353003..bcba2288300f3 100644
--- a/hadoop-common-project/hadoop-common/pom.xml
+++ b/hadoop-common-project/hadoop-common/pom.xml
@@ -651,14 +651,6 @@
           </filesets>
         </configuration>
       </plugin>
-        <plugin>
-            <groupId>org.apache.maven.plugins</groupId>
-            <artifactId>maven-compiler-plugin</artifactId>
-            <configuration>
-                <source>8</source>
-                <target>8</target>
-            </configuration>
-        </plugin>
     </plugins>
   </build>
 
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
index aefaa2b751df7..e525daef5c8bb 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java
@@ -32,7 +32,6 @@
 import java.util.List;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.CompletionException;
-import java.util.concurrent.ExecutionException;
 import java.util.function.IntFunction;
 import java.util.zip.CRC32;
 
@@ -41,7 +40,7 @@
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.impl.AbstractFSBuilderImpl;
-import org.apache.hadoop.fs.impl.AsyncReaderUtils;
+import org.apache.hadoop.fs.impl.VectoredReadUtils;
 import org.apache.hadoop.fs.impl.CombinedFileRange;
 import org.apache.hadoop.fs.impl.FutureDataInputStreamBuilderImpl;
 import org.apache.hadoop.fs.impl.OpenFileParameters;
@@ -382,7 +381,7 @@ static ByteBuffer checkBytes(ByteBuffer sumsBytes,
     public void readVectored(List<? extends FileRange> ranges,
                              IntFunction<ByteBuffer> allocate) throws IOException {
       // If the stream doesn't have checksums, just delegate.
-      AsyncReaderUtils.validateVectoredReadRanges(ranges);
+      VectoredReadUtils.validateVectoredReadRanges(ranges);
       if (sums == null) {
         datas.readVectored(ranges, allocate);
         return;
@@ -390,7 +389,7 @@ public void readVectored(List<? extends FileRange> ranges,
       int minSeek = minSeekForVectorReads();
       int maxSize = maxReadSizeForVectorReads();
       List<CombinedFileRange> dataRanges =
-          AsyncReaderUtils.sortAndMergeRanges(ranges, bytesPerSum,
+          VectoredReadUtils.sortAndMergeRanges(ranges, bytesPerSum,
               minSeek, maxReadSizeForVectorReads());
       List<CombinedFileRange> checksumRanges = findChecksumRanges(dataRanges,
           bytesPerSum, minSeek, maxSize);
@@ -412,7 +411,7 @@ public void readVectored(List<? extends FileRange> ranges,
           // Now, slice the read data range to the user's ranges
           for(FileRange original: ((CombinedFileRange) dataRange).getUnderlying()) {
             original.setData(result.thenApply(
-                (b) -> AsyncReaderUtils.sliceTo(b, dataRange.getOffset(), original)));
+                (b) -> VectoredReadUtils.sliceTo(b, dataRange.getOffset(), original)));
           }
         }
       }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
index 11e644917a53e..7e543ebf22669 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/PositionedReadable.java
@@ -17,14 +17,15 @@
  */
 package org.apache.hadoop.fs;
 
-import java.io.*;
+import java.io.EOFException;
+import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.List;
 import java.util.function.IntFunction;
 
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
-import org.apache.hadoop.fs.impl.AsyncReaderUtils;
+import org.apache.hadoop.fs.impl.VectoredReadUtils;
 
 /**
  * Stream that permits positional reading.
@@ -120,7 +121,7 @@ default int maxReadSizeForVectorReads() {
    */
   default void readVectored(List<? extends FileRange> ranges,
                             IntFunction<ByteBuffer> allocate) throws IOException {
-    AsyncReaderUtils.readAsync(this, ranges, allocate,  minSeekForVectorReads(),
+    VectoredReadUtils.readVectored(this, ranges, allocate,  minSeekForVectorReads(),
         maxReadSizeForVectorReads());
   }
 }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java
index 7300ed69e3456..ef0abce4ceac4 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java
@@ -20,7 +20,7 @@
 package org.apache.hadoop.fs;
 
 import org.apache.hadoop.classification.VisibleForTesting;
-import org.apache.hadoop.fs.impl.AsyncReaderUtils;
+import org.apache.hadoop.fs.impl.VectoredReadUtils;
 
 import java.io.BufferedOutputStream;
 import java.io.DataOutput;
@@ -284,11 +284,12 @@ public IOStatistics getIOStatistics() {
       return ioStatistics;
     }
 
-    // QQ: Should we make this synchronized?
     AsynchronousFileChannel getAsyncChannel() throws IOException {
       if (asyncChannel == null) {
-        asyncChannel = AsynchronousFileChannel.open(name.toPath(),
-            StandardOpenOption.READ);
+        synchronized (this) {
+          asyncChannel = AsynchronousFileChannel.open(name.toPath(),
+                  StandardOpenOption.READ);
+        }
       }
       return asyncChannel;
     }
@@ -299,7 +300,7 @@ public void readVectored(List<? extends FileRange> ranges,
 
       // Set up all of the futures, so that we can use them if things fail
       for(FileRange range: ranges) {
-        AsyncReaderUtils.validateRangeRequest(range);
+        VectoredReadUtils.validateRangeRequest(range);
         range.setData(new CompletableFuture<>());
       }
       try {
@@ -312,7 +313,7 @@ public void readVectored(List<? extends FileRange> ranges,
           channel.read(buffers[i], range.getOffset(), i, asyncHandler);
         }
       } catch (IOException ioe) {
-        LOG.error("Exception occurred during vectored read ", ioe);
+        LOG.debug("Exception occurred during vectored read ", ioe);
         for(FileRange range: ranges) {
           range.getData().completeExceptionally(ioe);
         }
@@ -359,7 +360,7 @@ public void completed(Integer result, Integer r) {
 
     @Override
     public void failed(Throwable exc, Integer r) {
-      LOG.error("Failed while reading range {} ", r, exc);
+      LOG.debug("Failed while reading range {} ", r, exc);
       ranges.get(r).getData().completeExceptionally(exc);
     }
   }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AsyncReaderUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/VectoredReadUtils.java
similarity index 86%
rename from hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AsyncReaderUtils.java
rename to hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/VectoredReadUtils.java
index 132f6e78b7645..6d5215f2b1511 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AsyncReaderUtils.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/VectoredReadUtils.java
@@ -33,7 +33,7 @@
 import org.apache.hadoop.fs.PositionedReadable;
 import org.apache.hadoop.util.Preconditions;
 
-public class AsyncReaderUtils {
+public class VectoredReadUtils {
 
   /**
    * @param range
@@ -68,11 +68,11 @@ public static void validateVectoredReadRanges(List<? extends FileRange> ranges)
    * @param minimumSeek the minimum number of bytes to seek over
    * @param maximumRead the largest number of bytes to combine into a single read
    */
-  public static void readAsync(PositionedReadable stream,
-                               List<? extends FileRange> ranges,
-                               IntFunction<ByteBuffer> allocate,
-                               int minimumSeek,
-                               int maximumRead) {
+  public static void readVectored(PositionedReadable stream,
+                                  List<? extends FileRange> ranges,
+                                  IntFunction<ByteBuffer> allocate,
+                                  int minimumSeek,
+                                  int maximumRead) {
     if (isOrderedDisjoint(ranges, 1, minimumSeek)) {
       for(FileRange range: ranges) {
         range.setData(readRangeFrom(stream, range, allocate));
@@ -109,17 +109,7 @@ public static CompletableFuture<ByteBuffer> readRangeFrom(PositionedReadable str
             buffer);
         buffer.flip();
       } else {
-        if (buffer.isDirect()) {
-          // if we need to read data from a direct buffer and the stream doesn't
-          // support it, we allocate a byte array to use.
-          byte[] tmp = new byte[range.getLength()];
-          stream.readFully(range.getOffset(), tmp, 0, tmp.length);
-          buffer.put(tmp);
-          buffer.flip();
-        } else {
-          stream.readFully(range.getOffset(), buffer.array(),
-              buffer.arrayOffset(), range.getLength());
-        }
+        readNonByteBufferPositionedReadable(stream, range, buffer);
       }
       result.complete(buffer);
     } catch (IOException ioe) {
@@ -128,6 +118,27 @@ public static CompletableFuture<ByteBuffer> readRangeFrom(PositionedReadable str
     return result;
   }
 
+  private static void readNonByteBufferPositionedReadable(PositionedReadable stream,
+                                                          FileRange range,
+                                                          ByteBuffer buffer) throws IOException {
+    if (buffer.isDirect()) {
+      buffer.put(readInDirectBuffer(stream, range));
+      buffer.flip();
+    } else {
+      stream.readFully(range.getOffset(), buffer.array(),
+              buffer.arrayOffset(), range.getLength());
+    }
+  }
+
+  private static byte[] readInDirectBuffer(PositionedReadable stream,
+                                           FileRange range) throws IOException {
+    // if we need to read data from a direct buffer and the stream doesn't
+    // support it, we allocate a byte array to use.
+    byte[] tmp = new byte[range.getLength()];
+    stream.readFully(range.getOffset(), tmp, 0, tmp.length);
+    return tmp;
+  }
+
   /**
    * Is the given input list:
    * <ul>
@@ -236,7 +247,7 @@ public static ByteBuffer sliceTo(ByteBuffer readData, long readOffset,
     return readData;
   }
 
-  private AsyncReaderUtils() {
+  private VectoredReadUtils() {
     throw new UnsupportedOperationException();
   }
 }
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestAsyncReaderUtils.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestVectoredReadUtils.java
similarity index 77%
rename from hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestAsyncReaderUtils.java
rename to hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestVectoredReadUtils.java
index 6fd7398cf362c..bc32e1033c3f1 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestAsyncReaderUtils.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestVectoredReadUtils.java
@@ -23,6 +23,8 @@
 import org.apache.hadoop.fs.FileRangeImpl;
 import org.apache.hadoop.fs.PositionedReadable;
 import org.junit.Test;
+
+import org.apache.hadoop.test.HadoopTestBase;
 import org.mockito.ArgumentMatchers;
 import org.mockito.Mockito;
 
@@ -34,6 +36,10 @@
 import java.util.concurrent.CompletableFuture;
 import java.util.function.IntFunction;
 
+import org.assertj.core.api.Assertions;
+
+import static org.apache.hadoop.test.MoreAsserts.assertFutureFailedExceptionaly;
+import static org.apache.hadoop.test.MoreAsserts.assertFutureCompletedSuccessfully;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotSame;
@@ -41,9 +47,9 @@
 import static org.junit.Assert.assertTrue;
 
 /**
- * Test behavior of {@link AsyncReaderUtils}.
+ * Test behavior of {@link VectoredReadUtils}.
  */
-public class TestAsyncReaderUtils  {
+public class TestVectoredReadUtils extends HadoopTestBase {
 
   @Test
   public void testSliceTo() {
@@ -55,19 +61,28 @@ public void testSliceTo() {
       intBuffer.put(i);
     }
     // ensure we don't make unnecessary slices
-    ByteBuffer slice = AsyncReaderUtils.sliceTo(buffer, 100,
+    ByteBuffer slice = VectoredReadUtils.sliceTo(buffer, 100,
         new FileRangeImpl(100, SIZE));
-    assertSame(buffer, slice);
+    Assertions.assertThat(buffer)
+            .describedAs("Slicing on the same offset shouldn't " +
+                    "create a new buffer")
+            .isEqualTo(slice);
 
     // try slicing a range
     final int OFFSET = 100;
     final int SLICE_START = 1024;
     final int SLICE_LENGTH = 16 * 1024;
-    slice = AsyncReaderUtils.sliceTo(buffer, OFFSET,
+    slice = VectoredReadUtils.sliceTo(buffer, OFFSET,
         new FileRangeImpl(OFFSET + SLICE_START, SLICE_LENGTH));
     // make sure they aren't the same, but use the same backing data
-    assertNotSame(buffer, slice);
-    assertSame(buffer.array(), slice.array());
+    Assertions.assertThat(buffer)
+            .describedAs("Slicing on new offset should " +
+                    "create a new buffer")
+            .isNotEqualTo(slice);
+    Assertions.assertThat(buffer.array())
+            .describedAs("Slicing should use the same underlying " +
+                    "data")
+            .isEqualTo(slice.array());
     // test the contents of the slice
     intBuffer = slice.asIntBuffer();
     for(int i=0; i < SLICE_LENGTH / Integer.BYTES; ++i) {
@@ -78,11 +93,11 @@ public void testSliceTo() {
   @Test
   public void testRounding() {
     for(int i=5; i < 10; ++i) {
-      assertEquals("i = "+ i, 5, AsyncReaderUtils.roundDown(i, 5));
-      assertEquals("i = "+ i, 10, AsyncReaderUtils.roundUp(i+1, 5));
+      assertEquals("i = "+ i, 5, VectoredReadUtils.roundDown(i, 5));
+      assertEquals("i = "+ i, 10, VectoredReadUtils.roundUp(i+1, 5));
     }
-    assertEquals(13, AsyncReaderUtils.roundDown(13, 1));
-    assertEquals(13, AsyncReaderUtils.roundUp(13, 1));
+    assertEquals(13, VectoredReadUtils.roundDown(13, 1));
+    assertEquals(13, VectoredReadUtils.roundUp(13, 1));
   }
 
   @Test
@@ -129,39 +144,39 @@ public void testSortAndMerge() {
         new FileRangeImpl(2100, 100),
         new FileRangeImpl(1000, 100)
         );
-    assertFalse(AsyncReaderUtils.isOrderedDisjoint(input, 100, 800));
-    List<CombinedFileRange> outputList = AsyncReaderUtils.sortAndMergeRanges(
+    assertFalse(VectoredReadUtils.isOrderedDisjoint(input, 100, 800));
+    List<CombinedFileRange> outputList = VectoredReadUtils.sortAndMergeRanges(
         input, 100, 1001, 2500);
     assertEquals(1, outputList.size());
     CombinedFileRange output = outputList.get(0);
     assertEquals(3, output.getUnderlying().size());
     assertEquals("range[1000,3100)", output.toString());
-    assertTrue(AsyncReaderUtils.isOrderedDisjoint(outputList, 100, 800));
+    assertTrue(VectoredReadUtils.isOrderedDisjoint(outputList, 100, 800));
 
     // the minSeek doesn't allow the first two to merge
-    assertFalse(AsyncReaderUtils.isOrderedDisjoint(input, 100, 1000));
-    outputList = AsyncReaderUtils.sortAndMergeRanges(input, 100, 1000, 2100);
+    assertFalse(VectoredReadUtils.isOrderedDisjoint(input, 100, 1000));
+    outputList = VectoredReadUtils.sortAndMergeRanges(input, 100, 1000, 2100);
     assertEquals(2, outputList.size());
     assertEquals("range[1000,1100)", outputList.get(0).toString());
     assertEquals("range[2100,3100)", outputList.get(1).toString());
-    assertTrue(AsyncReaderUtils.isOrderedDisjoint(outputList, 100, 1000));
+    assertTrue(VectoredReadUtils.isOrderedDisjoint(outputList, 100, 1000));
 
     // the maxSize doesn't allow the third range to merge
-    assertFalse(AsyncReaderUtils.isOrderedDisjoint(input, 100, 800));
-    outputList = AsyncReaderUtils.sortAndMergeRanges(input, 100, 1001, 2099);
+    assertFalse(VectoredReadUtils.isOrderedDisjoint(input, 100, 800));
+    outputList = VectoredReadUtils.sortAndMergeRanges(input, 100, 1001, 2099);
     assertEquals(2, outputList.size());
     assertEquals("range[1000,2200)", outputList.get(0).toString());
     assertEquals("range[3000,3100)", outputList.get(1).toString());
-    assertTrue(AsyncReaderUtils.isOrderedDisjoint(outputList, 100, 800));
+    assertTrue(VectoredReadUtils.isOrderedDisjoint(outputList, 100, 800));
 
     // test the round up and round down (the maxSize doesn't allow any merges)
-    assertFalse(AsyncReaderUtils.isOrderedDisjoint(input, 16, 700));
-    outputList = AsyncReaderUtils.sortAndMergeRanges(input, 16, 1001, 100);
+    assertFalse(VectoredReadUtils.isOrderedDisjoint(input, 16, 700));
+    outputList = VectoredReadUtils.sortAndMergeRanges(input, 16, 1001, 100);
     assertEquals(3, outputList.size());
     assertEquals("range[992,1104)", outputList.get(0).toString());
     assertEquals("range[2096,2208)", outputList.get(1).toString());
     assertEquals("range[2992,3104)", outputList.get(2).toString());
-    assertTrue(AsyncReaderUtils.isOrderedDisjoint(outputList, 16, 700));
+    assertTrue(VectoredReadUtils.isOrderedDisjoint(outputList, 16, 700));
   }
 
   interface Stream extends PositionedReadable, ByteBufferPositionedReadable {
@@ -184,9 +199,9 @@ public void testReadRangeFromByteBufferPositionedReadable() throws Exception {
     }).when(stream).readFully(ArgumentMatchers.anyLong(),
                               ArgumentMatchers.any(ByteBuffer.class));
     CompletableFuture<ByteBuffer> result =
-        AsyncReaderUtils.readRangeFrom(stream, new FileRangeImpl(1000, 100),
+        VectoredReadUtils.readRangeFrom(stream, new FileRangeImpl(1000, 100),
         ByteBuffer::allocate);
-    assertTrue(result.isDone());
+    assertFutureCompletedSuccessfully(result);
     ByteBuffer buffer = result.get();
     assertEquals(100, buffer.remaining());
     byte b = 0;
@@ -200,9 +215,9 @@ public void testReadRangeFromByteBufferPositionedReadable() throws Exception {
         .when(stream).readFully(ArgumentMatchers.anyLong(),
                                 ArgumentMatchers.any(ByteBuffer.class));
     result =
-        AsyncReaderUtils.readRangeFrom(stream, new FileRangeImpl(1000, 100),
+        VectoredReadUtils.readRangeFrom(stream, new FileRangeImpl(1000, 100),
             ByteBuffer::allocate);
-    assertTrue(result.isCompletedExceptionally());
+    assertFutureFailedExceptionaly(result);
   }
 
   static void runReadRangeFromPositionedReadable(IntFunction<ByteBuffer> allocate) throws Exception {
@@ -218,10 +233,9 @@ static void runReadRangeFromPositionedReadable(IntFunction<ByteBuffer> allocate)
         ArgumentMatchers.any(), ArgumentMatchers.anyInt(),
         ArgumentMatchers.anyInt());
     CompletableFuture<ByteBuffer> result =
-        AsyncReaderUtils.readRangeFrom(stream, new FileRangeImpl(1000, 100),
+        VectoredReadUtils.readRangeFrom(stream, new FileRangeImpl(1000, 100),
             allocate);
-    assertTrue(result.isDone());
-    assertFalse(result.isCompletedExceptionally());
+    assertFutureCompletedSuccessfully(result);
     ByteBuffer buffer = result.get();
     assertEquals(100, buffer.remaining());
     byte b = 0;
@@ -236,9 +250,9 @@ static void runReadRangeFromPositionedReadable(IntFunction<ByteBuffer> allocate)
         ArgumentMatchers.any(), ArgumentMatchers.anyInt(),
         ArgumentMatchers.anyInt());
     result =
-        AsyncReaderUtils.readRangeFrom(stream, new FileRangeImpl(1000, 100),
+        VectoredReadUtils.readRangeFrom(stream, new FileRangeImpl(1000, 100),
             ByteBuffer::allocate);
-    assertTrue(result.isCompletedExceptionally());
+    assertFutureFailedExceptionaly(result);
   }
 
   @Test
@@ -271,7 +285,7 @@ public void testReadAsync() throws Exception {
     }).when(stream).readFully(ArgumentMatchers.anyLong(),
         ArgumentMatchers.any(ByteBuffer.class));
     // should not merge the ranges
-    AsyncReaderUtils.readAsync(stream, input, ByteBuffer::allocate, 100, 100);
+    VectoredReadUtils.readVectored(stream, input, ByteBuffer::allocate, 100, 100);
     Mockito.verify(stream, Mockito.times(3))
         .readFully(ArgumentMatchers.anyLong(), ArgumentMatchers.any(ByteBuffer.class));
     for(int b=0; b < input.size(); ++b) {
@@ -291,7 +305,7 @@ public void testReadAsyncMerge() throws Exception {
     }).when(stream).readFully(ArgumentMatchers.anyLong(),
         ArgumentMatchers.any(ByteBuffer.class));
     // should merge the ranges into a single read
-    AsyncReaderUtils.readAsync(stream, input, ByteBuffer::allocate, 1000, 2100);
+    VectoredReadUtils.readVectored(stream, input, ByteBuffer::allocate, 1000, 2100);
     Mockito.verify(stream, Mockito.times(1))
         .readFully(ArgumentMatchers.anyLong(), ArgumentMatchers.any(ByteBuffer.class));
     for(int b=0; b < input.size(); ++b) {
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/MoreAsserts.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/MoreAsserts.java
index 142669b78682e..7dc6ab12a54ea 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/MoreAsserts.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/MoreAsserts.java
@@ -19,6 +19,9 @@
 package org.apache.hadoop.test;
 
 import java.util.Iterator;
+import java.util.concurrent.CompletableFuture;
+
+import org.assertj.core.api.Assertions;
 import org.junit.Assert;
 
 /**
@@ -28,17 +31,18 @@ public class MoreAsserts {
 
   /**
    * Assert equivalence for array and iterable
-   * @param <T> the type of the elements
-   * @param s the name/message for the collection
-   * @param expected  the expected array of elements
-   * @param actual    the actual iterable of elements
+   *
+   * @param <T>      the type of the elements
+   * @param s        the name/message for the collection
+   * @param expected the expected array of elements
+   * @param actual   the actual iterable of elements
    */
   public static <T> void assertEquals(String s, T[] expected,
                                       Iterable<T> actual) {
     Iterator<T> it = actual.iterator();
     int i = 0;
     for (; i < expected.length && it.hasNext(); ++i) {
-      Assert.assertEquals("Element "+ i +" for "+ s, expected[i], it.next());
+      Assert.assertEquals("Element " + i + " for " + s, expected[i], it.next());
     }
     Assert.assertTrue("Expected more elements", i == expected.length);
     Assert.assertTrue("Expected less elements", !it.hasNext());
@@ -46,7 +50,8 @@ public static <T> void assertEquals(String s, T[] expected,
 
   /**
    * Assert equality for two iterables
-   * @param <T> the type of the elements
+   *
+   * @param <T>      the type of the elements
    * @param s
    * @param expected
    * @param actual
@@ -57,10 +62,28 @@ public static <T> void assertEquals(String s, Iterable<T> expected,
     Iterator<T> ita = actual.iterator();
     int i = 0;
     while (ite.hasNext() && ita.hasNext()) {
-      Assert.assertEquals("Element "+ i +" for "+s, ite.next(), ita.next());
+      Assert.assertEquals("Element " + i + " for " + s, ite.next(), ita.next());
     }
     Assert.assertTrue("Expected more elements", !ite.hasNext());
     Assert.assertTrue("Expected less elements", !ita.hasNext());
   }
 
+
+  public static <T> void assertFutureCompletedSuccessfully(CompletableFuture<T> future) {
+    Assertions.assertThat(future.isDone())
+            .describedAs("This future is supposed to be " +
+                    "completed successfully")
+            .isTrue();
+    Assertions.assertThat(future.isCompletedExceptionally())
+            .describedAs("This future is supposed to be " +
+                    "completed successfully")
+            .isFalse();
+  }
+
+  public static <T> void assertFutureFailedExceptionaly(CompletableFuture<T> future) {
+    Assertions.assertThat(future.isCompletedExceptionally())
+            .describedAs("This future is supposed to be " +
+                    "completed exceptionally")
+            .isTrue();
+  }
 }
diff --git a/hadoop-common-project/pom.xml b/hadoop-common-project/pom.xml
index 58ef96a7a380c..f167a079a9b0c 100644
--- a/hadoop-common-project/pom.xml
+++ b/hadoop-common-project/pom.xml
@@ -56,13 +56,4 @@
       </plugin>
     </plugins>
   </build>
-
-  <profiles>
-    <profile>
-      <id>benchmark</id>
-      <modules>
-        <module>benchmark</module>
-      </modules>
-    </profile>
-  </profiles>
 </project>
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
index 253e31012df13..dd03ce7f44159 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
@@ -26,7 +26,7 @@
 import org.apache.hadoop.classification.VisibleForTesting;
 import org.apache.hadoop.util.Preconditions;
 import org.apache.hadoop.fs.FileRange;
-import org.apache.hadoop.fs.impl.AsyncReaderUtils;
+import org.apache.hadoop.fs.impl.VectoredReadUtils;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.CanSetReadahead;
@@ -821,7 +821,7 @@ public void readVectored(List<? extends FileRange> ranges,
    * @throws EOFException
    */
   private void validateRangeRequest(FileRange range) throws EOFException {
-    AsyncReaderUtils.validateRangeRequest(range);
+    VectoredReadUtils.validateRangeRequest(range);
     if(range.getOffset() + range.getLength() > contentLength) {
       throw new EOFException("Requested range is beyond EOF");
     }
diff --git a/hadoop-tools/hadoop-benchmark/src/main/java/org/apache/hadoop/benchmark/VectoredReadBenchmark.java b/hadoop-tools/hadoop-benchmark/src/main/java/org/apache/hadoop/benchmark/VectoredReadBenchmark.java
index a5e99dddfae20..2abcdd12daece 100644
--- a/hadoop-tools/hadoop-benchmark/src/main/java/org/apache/hadoop/benchmark/VectoredReadBenchmark.java
+++ b/hadoop-tools/hadoop-benchmark/src/main/java/org/apache/hadoop/benchmark/VectoredReadBenchmark.java
@@ -57,6 +57,9 @@ public class VectoredReadBenchmark {
 
   static final Path DATA_PATH = getTestDataPath();
   static final String DATA_PATH_PROPERTY = "bench.data";
+  static final int READ_SIZE = 64 * 1024;
+  static final long SEEK_SIZE = 1024L * 1024;
+
 
   static Path getTestDataPath() {
     String value = System.getProperty(DATA_PATH_PROPERTY);
@@ -104,7 +107,7 @@ public void asyncRead(FileSystemChoice fsChoice,
     FSDataInputStream stream = fsChoice.fs.open(DATA_PATH);
     List<FileRange> ranges = new ArrayList<>();
     for(int m=0; m < 100; ++m) {
-      FileRangeImpl range = new FileRangeImpl(m * 1024L * 1024, 64 * 1024);
+      FileRangeImpl range = new FileRangeImpl(m * SEEK_SIZE, READ_SIZE);
       ranges.add(range);
     }
     stream.readVectored(ranges, bufferChoice.allocate);
@@ -199,11 +202,10 @@ public void asyncFileChanArray(BufferChoice bufferChoice,
     AsynchronousFileChannel channel = AsynchronousFileChannel.open(path, StandardOpenOption.READ);
     List<FileRangeImpl> ranges = new ArrayList<>();
     Joiner joiner = new Joiner(100);
-    final int SIZE = 64 * 1024;
     for(int m=0; m < 100; ++m) {
-      ByteBuffer buffer = bufferChoice.allocate.apply(SIZE);
-      FileRangeCallback range = new FileRangeCallback(channel, m * 1024L * 1024,
-          SIZE, joiner, buffer);
+      ByteBuffer buffer = bufferChoice.allocate.apply(READ_SIZE);
+      FileRangeCallback range = new FileRangeCallback(channel, m * SEEK_SIZE,
+          READ_SIZE, joiner, buffer);
       ranges.add(range);
       channel.read(buffer, range.getOffset(), range, range);
     }
@@ -218,8 +220,8 @@ public void syncRead(FileSystemChoice fsChoice,
     FSDataInputStream stream = fsChoice.fs.open(DATA_PATH);
     List<byte[]> result = new ArrayList<>();
     for(int m=0; m < 100; ++m) {
-      byte[] buffer = new byte[64 * 1024];
-      stream.readFully(m * 1024L * 1024, buffer);
+      byte[] buffer = new byte[READ_SIZE];
+      stream.readFully(m * SEEK_SIZE, buffer);
       result.add(buffer);
     }
     blackhole.consume(result);

From fa3ebc1f53e6ca3d950e750de08a8ca0542ba69d Mon Sep 17 00:00:00 2001
From: Mukund Thakur <mthakur@cloudera.com>
Date: Tue, 23 Nov 2021 12:54:15 +0530
Subject: [PATCH 07/13] Merging of ranges in S3A vectored read implementation

---
 .../AbstractContractVectoredReadTest.java     | 10 ++-
 .../apache/hadoop/fs/s3a/S3AInputStream.java  | 64 ++++++++++++++++++-
 .../src/test/resources/log4j.properties       |  2 +-
 3 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
index df4ad2c061836..8956318ff1f21 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
@@ -82,12 +82,10 @@ public void testVectoredReadMultipleRanges() throws Exception {
         CompletableFuture<ByteBuffer> data = res.getData();
         try {
           ByteBuffer buffer = FutureIOSupport.awaitFuture(data);
-          LOG.info("Returned data from offset {} : {} ", res.getOffset(),
-                  Arrays.toString(buffer.array()));
-          //assertDatasetEquals((int) res.getOffset(), "readAsync", buffer, res.getLength());
+          assertDatasetEquals((int) res.getOffset(), "readAsync", buffer, res.getLength());
         } catch (Exception ex) {
           LOG.error("Exception while running vectored read ", ex);
-          //Assert.fail("Exception while running vectored read " + ex);
+          Assert.fail("Exception while running vectored read " + ex);
         }
       }
     }
@@ -104,7 +102,7 @@ public void testOverlappingRanges() throws Exception {
       for (FileRange res : fileRanges) {
         CompletableFuture<ByteBuffer> data = res.getData();
         try {
-          ByteBuffer buffer = data.get();
+          ByteBuffer buffer = FutureIOSupport.awaitFuture(data);
           assertDatasetEquals((int) res.getOffset(), "vecRead", buffer, res.getLength());
         } catch (Exception ex) {
           LOG.error("Exception while running vectored read ", ex);
@@ -187,7 +185,7 @@ private void assertDatasetEquals(
       int o = readOffset + i;
       assertEquals(operation + " with read offset " + readOffset
                       + ": data[" + i + "] != DATASET[" + o + "]",
-              DATASET[o], data.get(i));
+              DATASET[o], data.get());
     }
   }
 }
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
index dd03ce7f44159..b0ead37b81b5b 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
@@ -26,6 +26,8 @@
 import org.apache.hadoop.classification.VisibleForTesting;
 import org.apache.hadoop.util.Preconditions;
 import org.apache.hadoop.fs.FileRange;
+import org.apache.hadoop.fs.impl.CombinedFileRange;
+import org.apache.hadoop.fs.impl.FutureIOSupport;
 import org.apache.hadoop.fs.impl.VectoredReadUtils;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
@@ -55,6 +57,9 @@
 import java.util.function.IntFunction;
 
 import static org.apache.commons.lang3.StringUtils.isNotEmpty;
+import static org.apache.hadoop.fs.impl.VectoredReadUtils.isOrderedDisjoint;
+import static org.apache.hadoop.fs.impl.VectoredReadUtils.sliceTo;
+import static org.apache.hadoop.fs.impl.VectoredReadUtils.sortAndMergeRanges;
 import static org.apache.hadoop.util.StringUtils.toLowerCase;
 
 /**
@@ -803,16 +808,69 @@ public void readFully(long position, byte[] buffer, int offset, int length)
     }
   }
 
+  /**
+   * {@inheritDoc}
+   * Vectored read implementation for S3AInputStream.
+   * @param ranges the byte ranges to read.
+   * @param allocate the function to allocate ByteBuffer.
+   * @throws IOException IOE if any.
+   */
   @Override
   public void readVectored(List<? extends FileRange> ranges,
                            IntFunction<ByteBuffer> allocate) throws IOException {
+    // TODO   Cancelling of vectored read calls.
+    // No upfront cancelling is supported right now but all runnable
+    // tasks will be aborted when threadpool will shutdown during S3AFS.close();
+    // TODO   boundedThreadPool corner cases like rejections etc.
+    // Do we need to think about rejections or we can just use
+    // unbounded thread pool or even create a separate thread pool
+    // for vectored read api such that its bad usage doesn't cause
+    // contention for other api's like list, delete etc.
     checkNotClosed();
     for (FileRange range : ranges) {
       validateRangeRequest(range);
       CompletableFuture<ByteBuffer> result = new CompletableFuture<>();
       range.setData(result);
-      boundedThreadPool.submit(() -> readSingleRange(range, allocate));
     }
+
+    if (isOrderedDisjoint(ranges, 1, minSeekForVectorReads())) {
+      for(FileRange range: ranges) {
+        boundedThreadPool.submit(() -> readSingleRange(range, allocate));
+      }
+    } else {
+      for(CombinedFileRange combinedFileRange: sortAndMergeRanges(ranges, 1, minSeekForVectorReads(),
+              maxReadSizeForVectorReads())) {
+        CompletableFuture<ByteBuffer> result = new CompletableFuture<>();
+        combinedFileRange.setData(result);
+        boundedThreadPool.submit(
+                () -> readCombinedRangeAndUpdateChildren(combinedFileRange, allocate));
+      }
+    }
+  }
+
+  private void readCombinedRangeAndUpdateChildren(CombinedFileRange combinedFileRange,
+                                                  IntFunction<ByteBuffer> allocate) {
+    readSingleRange(combinedFileRange, allocate);
+    try {
+      ByteBuffer combinedBuffer = FutureIOSupport.awaitFuture(combinedFileRange.getData());
+      for(FileRange child : combinedFileRange.getUnderlying()) {
+        updateOriginalRange(child, combinedBuffer, combinedFileRange);
+      }
+    } catch (IOException ex) {
+      LOG.error("Exception occurred while reading combined range ", ex);
+      for(FileRange child : combinedFileRange.getUnderlying()) {
+        child.getData().completeExceptionally(ex);
+      }
+    }
+  }
+  private void updateOriginalRange(FileRange child,
+                                   ByteBuffer combinedBuffer,
+                                   CombinedFileRange combinedFileRange) {
+    LOG.trace("Start Filling original range [{}, {}) from combined range [{}, {}) ",
+            child.getOffset(), child.getLength(), combinedFileRange.getOffset(), combinedFileRange.getLength());
+    ByteBuffer childBuffer = sliceTo(combinedBuffer, combinedFileRange.getOffset(), child);
+    child.getData().complete(childBuffer);
+    LOG.trace("End Filling original range ");
   }
 
   /**
@@ -839,8 +897,8 @@ private void readSingleRange(FileRange range, IntFunction<ByteBuffer> allocate)
       ByteBuffer buffer = allocate.apply(length);
       final GetObjectRequest request = client.newGetRequest(key)
               .withRange(position, position + range.getLength() - 1);
-      String text = String.format("%s %s at %d",
-              "readAsync", uri, position);
+      String text = String.format("%s %s at %d length %d",
+              "readAsync", uri, position, length);
       S3Object object = Invoker.once(text, uri,
               () -> client.getObject(request));
       S3ObjectInputStream objectContent = object.getObjectContent();
diff --git a/hadoop-tools/hadoop-aws/src/test/resources/log4j.properties b/hadoop-tools/hadoop-aws/src/test/resources/log4j.properties
index 653c8a75e69ff..bd583abdaddf6 100644
--- a/hadoop-tools/hadoop-aws/src/test/resources/log4j.properties
+++ b/hadoop-tools/hadoop-aws/src/test/resources/log4j.properties
@@ -52,7 +52,7 @@ log4j.logger.org.apache.hadoop.ipc.Server=WARN
 
 # for debugging low level S3a operations, uncomment these lines
 # Log all S3A classes
-#log4j.logger.org.apache.hadoop.fs.s3a=DEBUG
+log4j.logger.org.apache.hadoop.fs.s3a=DEBUG
 #log4j.logger.org.apache.hadoop.fs.s3a.S3AUtils=INFO
 #log4j.logger.org.apache.hadoop.fs.s3a.Listing=INFO
 # Log S3Guard classes

From f5180f31f1fd32ece2ba080d0c9c6559d27b2c73 Mon Sep 17 00:00:00 2001
From: Mukund Thakur <mthakur@cloudera.com>
Date: Mon, 6 Dec 2021 12:00:16 +0530
Subject: [PATCH 08/13] Implementing change detection for vectored reads in S3a

---
 .../markdown/filesystem/fsdatainputstream.md  |  2 +-
 .../AbstractContractVectoredReadTest.java     | 33 ++++++-
 .../apache/hadoop/fs/s3a/S3AInputStream.java  | 93 ++++++++++++++-----
 3 files changed, 102 insertions(+), 26 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md
index 42897ee901bc0..02d6765bd4c53 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md
@@ -443,7 +443,7 @@ The semantics of this are exactly equivalent to
 That is, the buffer is filled entirely with the contents of the input source
 from position `position`
 
-### `default void readAsync(List<? extends FileRange> ranges, IntFunction<ByteBuffer> allocate)`
+### `default void readVectored(List<? extends FileRange> ranges, IntFunction<ByteBuffer> allocate)`
 
 Read fully data for a list of ranges asynchronously. The default implementation
 iterates through the ranges, tries to coalesce the ranges based on values of
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
index 8956318ff1f21..ff6443a96ea2e 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
@@ -36,9 +36,9 @@ public abstract class AbstractContractVectoredReadTest extends AbstractFSContrac
   private static final byte[] DATASET = ContractTestUtils.dataset(DATASET_LEN, 'a', 32);
   private static final String VECTORED_READ_FILE_NAME = "vectored_file.txt";
 
-  private IntFunction<ByteBuffer> allocate;
+  private final IntFunction<ByteBuffer> allocate;
 
-  private String bufferType;
+  private final String bufferType;
 
   @Parameterized.Parameters
   public static List<String> params() {
@@ -59,6 +59,20 @@ public void setup() throws Exception {
     createFile(fs, path, true, DATASET);
   }
 
+  /**
+   * TODO: Improve coverage here by adding :
+   * 1. Some disjoint along with combinable ranges.
+   * 2. Non overlapping ranges.
+   * 3. Overlapping + Non overlapping ranges.
+   * 4. Limit buffer allocation.
+   * 5. Trying to exhaust thread pool -> maybe scale tests.
+   * 6. testNormalReadAfterVectoredRead
+   * 7. testVectoredReadAfterNormalRead
+   * 8. testMultipleVectoredReads
+   * 9. test allocate capacity limit errors.
+   *
+   * @throws Exception
+   */
   @Test
   public void testVectoredReadMultipleRanges() throws Exception {
     describe("Running with buffer type : " + bufferType);
@@ -151,6 +165,21 @@ public void testNegativeOffsetRange()  throws Exception {
     testExceptionalVectoredRead(fs, fileRanges, "Exception is expected");
   }
 
+  @Test
+  public void testNormalReadAfterVectoredRead() throws Exception {
+
+  }
+
+  @Test
+  public void testVectoredReadAfterNormalRead() throws Exception {
+
+  }
+
+  @Test
+  public void testMultipleVectoredReads() throws Exception {
+
+  }
+
   protected void testExceptionalVectoredRead(FileSystem fs,
                                              List<FileRange> fileRanges,
                                              String s) throws IOException {
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
index b0ead37b81b5b..79d2b93deb14e 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
@@ -835,22 +835,24 @@ public void readVectored(List<? extends FileRange> ranges,
 
     if (isOrderedDisjoint(ranges, 1, minSeekForVectorReads())) {
       for(FileRange range: ranges) {
-        boundedThreadPool.submit(() -> readSingleRange(range, allocate));
+        ByteBuffer buffer = allocate.apply(range.getLength());
+        boundedThreadPool.submit(() -> readSingleRange(range, buffer));
       }
     } else {
       for(CombinedFileRange combinedFileRange: sortAndMergeRanges(ranges, 1, minSeekForVectorReads(),
               maxReadSizeForVectorReads())) {
         CompletableFuture<ByteBuffer> result = new CompletableFuture<>();
+        ByteBuffer buffer = allocate.apply(combinedFileRange.getLength());
         combinedFileRange.setData(result);
         boundedThreadPool.submit(
-                () -> readCombinedRangeAndUpdateChildren(combinedFileRange, allocate));
+                () -> readCombinedRangeAndUpdateChildren(combinedFileRange, buffer));
       }
     }
   }
 
   private void readCombinedRangeAndUpdateChildren(CombinedFileRange combinedFileRange,
-                                                  IntFunction<ByteBuffer> allocate) {
-    readSingleRange(combinedFileRange, allocate);
+                                                  ByteBuffer buffer) {
+    readSingleRange(combinedFileRange, buffer);
     try {
       ByteBuffer combinedBuffer = FutureIOSupport.awaitFuture(combinedFileRange.getData());
       for(FileRange child : combinedFileRange.getUnderlying()) {
@@ -886,30 +888,22 @@ private void validateRangeRequest(FileRange range) throws EOFException {
   }
 
   /**
-   * Add retry in client.getObject().
+   * Add retry in client.getObject(). not present in older reads why here??
+   * Okay retry is being done in the top layer during read.
+   * But if we do here in the top layer, one issue I am thinking is
+   * what if there is some error which happened during filling the buffer
+   * If we retry that old offsets of heap buffers can be overwritten ?
    * @param range
-   * @param allocate
+   * @param buffer
    */
-  private void readSingleRange(FileRange range, IntFunction<ByteBuffer> allocate) {
+  private void readSingleRange(FileRange range, ByteBuffer buffer) {
     try {
       long position = range.getOffset();
       int length = range.getLength();
-      ByteBuffer buffer = allocate.apply(length);
-      final GetObjectRequest request = client.newGetRequest(key)
-              .withRange(position, position + range.getLength() - 1);
-      String text = String.format("%s %s at %d length %d",
-              "readAsync", uri, position, length);
-      S3Object object = Invoker.once(text, uri,
-              () -> client.getObject(request));
-      S3ObjectInputStream objectContent = object.getObjectContent();
-      if (buffer.isDirect()) {
-        byte[] tmp = new byte[length];
-        objectContent.read(tmp, 0, length);
-        buffer.put(tmp);
-        buffer.flip();
-      } else {
-        objectContent.read(buffer.array(), buffer.arrayOffset(), range.getLength());
-      }
+      final String operationName = "readVectored";
+      S3Object objectRange = getS3Object(operationName, position, length);
+      S3ObjectInputStream objectContent = objectRange.getObjectContent();
+      populateBuffer(length, buffer, objectContent);
       range.getData().complete(buffer);
     } catch (IOException ex) {
       LOG.error("Exception while reading a range {} ", range, ex);
@@ -917,6 +911,59 @@ private void readSingleRange(FileRange range, IntFunction<ByteBuffer> allocate)
     }
   }
 
+  /**
+   * Populates the buffer with data from objectContent
+   * till length. Handles both direct and heap byte buffers.
+   * @param length
+   * @param buffer
+   * @param objectContent
+   * @throws IOException
+   */
+  private void populateBuffer(int length,
+                              ByteBuffer buffer,
+                              S3ObjectInputStream objectContent) throws IOException {
+    if (buffer.isDirect()) {
+      byte[] tmp = new byte[length];
+      objectContent.read(tmp, 0, length);
+      buffer.put(tmp);
+      buffer.flip();
+    } else {
+      objectContent.read(buffer.array(), buffer.arrayOffset(), length);
+    }
+  }
+
+  /**
+   * Read data from S3 using a http request.
+   *
+   * @param operationName
+   * @param position
+   * @param length
+   * @return
+   * @throws IOException
+   */
+  private S3Object getS3Object(String operationName, long position,
+                               int length) throws IOException {
+    final GetObjectRequest request = client.newGetRequest(key)
+            .withRange(position, position + length - 1);
+    String text = String.format("%s %s at %d length %d",
+            operationName, uri, position, length);
+    changeTracker.maybeApplyConstraint(request);
+    DurationTracker tracker = streamStatistics.initiateGetRequest();
+    S3Object objectRange;
+    try {
+      objectRange = Invoker.once(text, uri,
+              () -> client.getObject(request));
+    } catch (IOException ex) {
+      tracker.failed();
+      throw ex;
+    } finally {
+      tracker.close();
+
+    }
+    changeTracker.processResponse(objectRange, operationName,
+            position);
+    return objectRange;
+  }
 
   /**
    * Access the input stream statistics.

From e084602f0a5c41b9b770155835e3502eb0730917 Mon Sep 17 00:00:00 2001
From: Mukund Thakur <mthakur@cloudera.com>
Date: Tue, 7 Dec 2021 14:31:25 +0530
Subject: [PATCH 09/13] More tests and javadoc

---
 .../AbstractContractVectoredReadTest.java     | 79 +++++++++++++------
 .../hadoop/fs/impl/TestVectoredReadUtils.java | 26 ++++++
 .../apache/hadoop/fs/s3a/S3AFileSystem.java   |  2 +-
 .../apache/hadoop/fs/s3a/S3AInputStream.java  | 53 +++++++++----
 4 files changed, 120 insertions(+), 40 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
index ff6443a96ea2e..90961ac42a58e 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
@@ -92,16 +92,7 @@ public void testVectoredReadMultipleRanges() throws Exception {
       CompletableFuture<Void> combinedFuture = CompletableFuture.allOf(completableFutures);
       combinedFuture.get();
 
-      for (FileRange res : fileRanges) {
-        CompletableFuture<ByteBuffer> data = res.getData();
-        try {
-          ByteBuffer buffer = FutureIOSupport.awaitFuture(data);
-          assertDatasetEquals((int) res.getOffset(), "readAsync", buffer, res.getLength());
-        } catch (Exception ex) {
-          LOG.error("Exception while running vectored read ", ex);
-          Assert.fail("Exception while running vectored read " + ex);
-        }
-      }
+      validateVectoredReadResult(fileRanges);
     }
   }
 
@@ -113,16 +104,7 @@ public void testOverlappingRanges() throws Exception {
     fileRanges.add(new FileRangeImpl(90, 50));
     try (FSDataInputStream in = fs.open(path(VECTORED_READ_FILE_NAME))) {
       in.readVectored(fileRanges, allocate);
-      for (FileRange res : fileRanges) {
-        CompletableFuture<ByteBuffer> data = res.getData();
-        try {
-          ByteBuffer buffer = FutureIOSupport.awaitFuture(data);
-          assertDatasetEquals((int) res.getOffset(), "vecRead", buffer, res.getLength());
-        } catch (Exception ex) {
-          LOG.error("Exception while running vectored read ", ex);
-          Assert.fail("Exception while running vectored read " + ex);
-        }
-      }
+      validateVectoredReadResult(fileRanges);
     }
   }
 
@@ -167,17 +149,70 @@ public void testNegativeOffsetRange()  throws Exception {
 
   @Test
   public void testNormalReadAfterVectoredRead() throws Exception {
-
+    FileSystem fs = getFileSystem();
+    List<FileRange> fileRanges = createSomeOverlappingRanges();
+    try (FSDataInputStream in = fs.open(path(VECTORED_READ_FILE_NAME))) {
+      in.readVectored(fileRanges, allocate);
+      // read starting 200 bytes
+      byte[] res = new byte[200];
+      in.read(res, 0, 200);
+      ByteBuffer buffer = ByteBuffer.wrap(res);
+      assertDatasetEquals(0, "normal_read", buffer, 200);
+      Assertions.assertThat(in.getPos())
+              .describedAs("Vectored read shouldn't change file pointer.")
+              .isEqualTo(200);
+      validateVectoredReadResult(fileRanges);
+    }
   }
 
   @Test
   public void testVectoredReadAfterNormalRead() throws Exception {
-
+    FileSystem fs = getFileSystem();
+    List<FileRange> fileRanges = createSomeOverlappingRanges();
+    try (FSDataInputStream in = fs.open(path(VECTORED_READ_FILE_NAME))) {
+      // read starting 200 bytes
+      byte[] res = new byte[200];
+      in.read(res, 0, 200);
+      ByteBuffer buffer = ByteBuffer.wrap(res);
+      assertDatasetEquals(0, "normal_read", buffer, 200);
+      Assertions.assertThat(in.getPos())
+              .describedAs("Vectored read shouldn't change file pointer.")
+              .isEqualTo(200);
+      in.readVectored(fileRanges, allocate);
+      validateVectoredReadResult(fileRanges);
+    }
   }
 
   @Test
   public void testMultipleVectoredReads() throws Exception {
+    FileSystem fs = getFileSystem();
+    List<FileRange> fileRanges1 = createSomeOverlappingRanges();
+    List<FileRange> fileRanges2 = createSomeOverlappingRanges();
+    try (FSDataInputStream in = fs.open(path(VECTORED_READ_FILE_NAME))) {
+      in.readVectored(fileRanges1, allocate);
+      in.readVectored(fileRanges2, allocate);
+      validateVectoredReadResult(fileRanges2);
+      validateVectoredReadResult(fileRanges1);
+    }
+  }
 
+  protected List<FileRange> createSomeOverlappingRanges() {
+    List<FileRange> fileRanges = new ArrayList<>();
+    fileRanges.add(new FileRangeImpl(0, 100));
+    fileRanges.add(new FileRangeImpl(90, 50));
+    return fileRanges;
+  }
+  protected void validateVectoredReadResult(List<FileRange> fileRanges) {
+    for (FileRange res : fileRanges) {
+      CompletableFuture<ByteBuffer> data = res.getData();
+      try {
+        ByteBuffer buffer = FutureIOSupport.awaitFuture(data);
+        assertDatasetEquals((int) res.getOffset(), "vecRead", buffer, res.getLength());
+      } catch (Exception ex) {
+        LOG.error("Exception while running vectored read ", ex);
+        Assert.fail("Exception while running vectored read " + ex);
+      }
+    }
   }
 
   protected void testExceptionalVectoredRead(FileSystem fs,
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestVectoredReadUtils.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestVectoredReadUtils.java
index bc32e1033c3f1..01605f9ee53fd 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestVectoredReadUtils.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/impl/TestVectoredReadUtils.java
@@ -179,6 +179,32 @@ public void testSortAndMerge() {
     assertTrue(VectoredReadUtils.isOrderedDisjoint(outputList, 16, 700));
   }
 
+  @Test
+  public void testSortAndMergeMoreCases() throws Exception {
+    List<FileRange> input = Arrays.asList(
+            new FileRangeImpl(3000, 110),
+            new FileRangeImpl(3000, 100),
+            new FileRangeImpl(2100, 100),
+            new FileRangeImpl(1000, 100)
+    );
+    assertFalse(VectoredReadUtils.isOrderedDisjoint(input, 100, 800));
+    List<CombinedFileRange> outputList = VectoredReadUtils.sortAndMergeRanges(
+            input, 1, 1001, 2500);
+    assertEquals(1, outputList.size());
+    CombinedFileRange output = outputList.get(0);
+    assertEquals(4, output.getUnderlying().size());
+    assertEquals("range[1000,3110)", output.toString());
+    assertTrue(VectoredReadUtils.isOrderedDisjoint(outputList, 1, 800));
+
+    outputList = VectoredReadUtils.sortAndMergeRanges(
+            input, 100, 1001, 2500);
+    assertEquals(1, outputList.size());
+    output = outputList.get(0);
+    assertEquals(4, output.getUnderlying().size());
+    assertEquals("range[1000,3200)", output.toString());
+    assertTrue(VectoredReadUtils.isOrderedDisjoint(outputList, 1, 800));
+
+  }
   interface Stream extends PositionedReadable, ByteBufferPositionedReadable {
     // nothing
   }
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
index 5fbdc4e03b69c..35ce29483f407 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
@@ -1500,7 +1500,7 @@ private FSDataInputStream open(
             readContext,
             createObjectAttributes(fileStatus),
             createInputStreamCallbacks(auditSpan),
-            boundedThreadPool));
+            unboundedThreadPool));
   }
 
   /**
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
index 79d2b93deb14e..83333c4bcb95e 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
@@ -54,6 +54,7 @@
 import java.util.List;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutorService;
+import java.util.concurrent.ThreadPoolExecutor;
 import java.util.function.IntFunction;
 
 import static org.apache.commons.lang3.StringUtils.isNotEmpty;
@@ -112,7 +113,7 @@ public class S3AInputStream extends FSInputStream implements  CanSetReadahead,
   private S3ObjectInputStream wrappedStream;
   private final S3AReadOpContext context;
   private final InputStreamCallbacks client;
-  private final ExecutorService boundedThreadPool;
+  private final ThreadPoolExecutor unboundedThreadPool;
   private final String bucket;
   private final String key;
   private final String pathStr;
@@ -160,7 +161,7 @@ public class S3AInputStream extends FSInputStream implements  CanSetReadahead,
   public S3AInputStream(S3AReadOpContext ctx,
                         S3ObjectAttributes s3Attributes,
                         InputStreamCallbacks client,
-                        ExecutorService threadPool) {
+                        ThreadPoolExecutor unboundedThreadPool) {
     Preconditions.checkArgument(isNotEmpty(s3Attributes.getBucket()),
         "No Bucket");
     Preconditions.checkArgument(isNotEmpty(s3Attributes.getKey()), "No Key");
@@ -182,7 +183,7 @@ public S3AInputStream(S3AReadOpContext ctx,
         s3Attributes);
     setInputPolicy(ctx.getInputPolicy());
     setReadahead(ctx.getReadahead());
-    this.boundedThreadPool = threadPool;
+    this.unboundedThreadPool = unboundedThreadPool;
   }
 
   /**
@@ -821,11 +822,11 @@ public void readVectored(List<? extends FileRange> ranges,
     // TODO   Cancelling of vectored read calls.
     // No upfront cancelling is supported right now but all runnable
     // tasks will be aborted when threadpool will shutdown during S3AFS.close();
-    // TODO   boundedThreadPool corner cases like rejections etc.
-    // Do we need to think about rejections or we can just use
-    // unbounded thread pool or even create a separate thread pool
+    // TODO   unbounded corner cases like starvation etc.
+    // think of creating a separate thread pool
     // for vectored read api such that its bad usage doesn't cause
-    // contention for other api's like list, delete etc.
+    // starvation for other api's like list, delete etc.
+    // TODO: what if combined range becomes so big that memory can't be allocated.
     checkNotClosed();
     for (FileRange range : ranges) {
       validateRangeRequest(range);
@@ -836,7 +837,7 @@ public void readVectored(List<? extends FileRange> ranges,
     if (isOrderedDisjoint(ranges, 1, minSeekForVectorReads())) {
       for(FileRange range: ranges) {
         ByteBuffer buffer = allocate.apply(range.getLength());
-        boundedThreadPool.submit(() -> readSingleRange(range, buffer));
+        unboundedThreadPool.submit(() -> readSingleRange(range, buffer));
       }
     } else {
       for(CombinedFileRange combinedFileRange: sortAndMergeRanges(ranges, 1, minSeekForVectorReads(),
@@ -844,12 +845,18 @@ public void readVectored(List<? extends FileRange> ranges,
         CompletableFuture<ByteBuffer> result = new CompletableFuture<>();
         ByteBuffer buffer = allocate.apply(combinedFileRange.getLength());
         combinedFileRange.setData(result);
-        boundedThreadPool.submit(
+        unboundedThreadPool.submit(
                 () -> readCombinedRangeAndUpdateChildren(combinedFileRange, buffer));
       }
     }
   }
 
+  /**
+   * Read data in the combinedFileRange and update data in buffers
+   * of all underlying ranges.
+   * @param combinedFileRange combined range.
+   * @param buffer combined buffer.
+   */
   private void readCombinedRangeAndUpdateChildren(CombinedFileRange combinedFileRange,
                                                   ByteBuffer buffer) {
     readSingleRange(combinedFileRange, buffer);
@@ -865,6 +872,13 @@ private void readCombinedRangeAndUpdateChildren(CombinedFileRange combinedFileRa
       }
     }
   }
+
+  /**
+   * Update data in child range from combined range.
+   * @param child child range.
+   * @param combinedBuffer combined buffer.
+   * @param combinedFileRange
+   */
   private void updateOriginalRange(FileRange child,
                                    ByteBuffer combinedBuffer,
                                    CombinedFileRange combinedFileRange) {
@@ -888,13 +902,15 @@ private void validateRangeRequest(FileRange range) throws EOFException {
   }
 
   /**
-   * Add retry in client.getObject(). not present in older reads why here??
+   * TODO: Add retry in client.getObject(). not present in older reads why here??
    * Okay retry is being done in the top layer during read.
    * But if we do here in the top layer, one issue I am thinking is
    * what if there is some error which happened during filling the buffer
    * If we retry that old offsets of heap buffers can be overwritten ?
-   * @param range
-   * @param buffer
+   * I think retry should be only added in {@link S3AInputStream#getS3Object}
+   * Read data from S3 for this range and populate the bufffer.
+   * @param range range of data to read.
+   * @param buffer buffer to fill.
    */
   private void readSingleRange(FileRange range, ByteBuffer buffer) {
     try {
@@ -914,10 +930,10 @@ private void readSingleRange(FileRange range, ByteBuffer buffer) {
   /**
    * Populates the buffer with data from objectContent
    * till length. Handles both direct and heap byte buffers.
-   * @param length
-   * @param buffer
-   * @param objectContent
-   * @throws IOException
+   * @param length length of data to populate.
+   * @param buffer buffer to fill.
+   * @param objectContent result retrieved from S3 store.
+   * @throws IOException any IOE.
    */
   private void populateBuffer(int length,
                               ByteBuffer buffer,
@@ -934,7 +950,10 @@ private void populateBuffer(int length,
 
   /**
    * Read data from S3 using a http request.
-   *
+   * This also handles if file has been changed while http call
+   * is getting executed. If file has been changed RemoteFileChangedException
+   * is thrown.
+   * TODO: add retries here.
    * @param operationName
    * @param position
    * @param length

From 9fd804c8cdd5c47e4b026b7cea86bdae98a90aa8 Mon Sep 17 00:00:00 2001
From: Mukund Thakur <mthakur@cloudera.com>
Date: Thu, 23 Dec 2021 15:31:18 +0530
Subject: [PATCH 10/13] Cleaning the s3objets else connections were getting
 exhausted

---
 .../AbstractContractVectoredReadTest.java        | 16 ++++++++++++++++
 .../org/apache/hadoop/fs/s3a/S3AInputStream.java |  9 +++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
index 90961ac42a58e..2ff0dfa3c4af7 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
@@ -96,6 +96,22 @@ public void testVectoredReadMultipleRanges() throws Exception {
     }
   }
 
+  @Test
+  public void testVectoredReadAndReadFully()  throws Exception {
+    FileSystem fs = getFileSystem();
+    List<FileRange> fileRanges = new ArrayList<>();
+    fileRanges.add(new FileRangeImpl(100, 100));
+    try (FSDataInputStream in = fs.open(path(VECTORED_READ_FILE_NAME))) {
+      in.readVectored(fileRanges, allocate);
+      byte[] readFullRes = new byte[100];
+      in.readFully(100, readFullRes);
+      ByteBuffer vecRes = FutureIOSupport.awaitFuture(fileRanges.get(0).getData());
+      Assertions.assertThat(vecRes)
+              .describedAs("Result from vectored read and readFully must match")
+              .isEqualByComparingTo(ByteBuffer.wrap(readFullRes));
+    }
+  }
+
   @Test
   public void testOverlappingRanges() throws Exception {
     FileSystem fs = getFileSystem();
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
index 83333c4bcb95e..953a0bd2d128e 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
@@ -24,6 +24,7 @@
 import com.amazonaws.services.s3.model.S3Object;
 import com.amazonaws.services.s3.model.S3ObjectInputStream;
 import org.apache.hadoop.classification.VisibleForTesting;
+import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.util.Preconditions;
 import org.apache.hadoop.fs.FileRange;
 import org.apache.hadoop.fs.impl.CombinedFileRange;
@@ -913,17 +914,21 @@ private void validateRangeRequest(FileRange range) throws EOFException {
    * @param buffer buffer to fill.
    */
   private void readSingleRange(FileRange range, ByteBuffer buffer) {
+    S3Object objectRange = null;
+    S3ObjectInputStream objectContent = null;
     try {
       long position = range.getOffset();
       int length = range.getLength();
       final String operationName = "readVectored";
-      S3Object objectRange = getS3Object(operationName, position, length);
-      S3ObjectInputStream objectContent = objectRange.getObjectContent();
+      objectRange = getS3Object(operationName, position, length);
+      objectContent = objectRange.getObjectContent();
       populateBuffer(length, buffer, objectContent);
       range.getData().complete(buffer);
     } catch (IOException ex) {
       LOG.error("Exception while reading a range {} ", range, ex);
       range.getData().completeExceptionally(ex);
+    } finally {
+      IOUtils.cleanupWithLogger(LOG, objectRange, objectContent);
     }
   }
 

From b78dded39a52c7c63600f4b0d0ae975702a18c55 Mon Sep 17 00:00:00 2001
From: Mukund Thakur <mthakur@cloudera.com>
Date: Thu, 6 Jan 2022 13:58:20 +0530
Subject: [PATCH 11/13] Adding retries in getS3Object

---
 .../apache/hadoop/fs/s3a/S3AInputStream.java  | 37 +++++++++++--------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
index 953a0bd2d128e..9540e3d893c2e 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
@@ -878,7 +878,7 @@ private void readCombinedRangeAndUpdateChildren(CombinedFileRange combinedFileRa
    * Update data in child range from combined range.
    * @param child child range.
    * @param combinedBuffer combined buffer.
-   * @param combinedFileRange
+   * @param combinedFileRange combined range.
    */
   private void updateOriginalRange(FileRange child,
                                    ByteBuffer combinedBuffer,
@@ -887,18 +887,22 @@ private void updateOriginalRange(FileRange child,
             child.getOffset(), child.getLength(), combinedFileRange.getOffset(), combinedFileRange.getLength());
     ByteBuffer childBuffer = sliceTo(combinedBuffer, combinedFileRange.getOffset(), child);
     child.getData().complete(childBuffer);
-    LOG.trace("End Filling original range ");
+    LOG.trace("End Filling original range [{}, {}) from combined range [{}, {}) ",
+            child.getOffset(), child.getLength(), combinedFileRange.getOffset(), combinedFileRange.getLength());
   }
 
   /**
    * Validates range parameters.
-   * @param range
-   * @throws EOFException
+   * @param range requested range.
+   * @throws EOFException end of file exception.
    */
   private void validateRangeRequest(FileRange range) throws EOFException {
     VectoredReadUtils.validateRangeRequest(range);
     if(range.getOffset() + range.getLength() > contentLength) {
-      throw new EOFException("Requested range is beyond EOF");
+      LOG.error("Requested range [{}, {}) is beyond EOF",
+              range.getOffset(), range.getLength());
+      throw new EOFException("Requested range [" + range.getOffset() +", "
+              + range.getLength() + ") is beyond EOF");
     }
   }
 
@@ -914,6 +918,7 @@ private void validateRangeRequest(FileRange range) throws EOFException {
    * @param buffer buffer to fill.
    */
   private void readSingleRange(FileRange range, ByteBuffer buffer) {
+    LOG.debug("Start reading range {} ", range);
     S3Object objectRange = null;
     S3ObjectInputStream objectContent = null;
     try {
@@ -922,6 +927,10 @@ private void readSingleRange(FileRange range, ByteBuffer buffer) {
       final String operationName = "readVectored";
       objectRange = getS3Object(operationName, position, length);
       objectContent = objectRange.getObjectContent();
+      if (objectContent == null) {
+        throw new PathIOException(uri,
+                "Null IO stream received during " + operationName);
+      }
       populateBuffer(length, buffer, objectContent);
       range.getData().complete(buffer);
     } catch (IOException ex) {
@@ -930,6 +939,7 @@ private void readSingleRange(FileRange range, ByteBuffer buffer) {
     } finally {
       IOUtils.cleanupWithLogger(LOG, objectRange, objectContent);
     }
+    LOG.debug("Finished reading range {} ", range);
   }
 
   /**
@@ -958,31 +968,28 @@ private void populateBuffer(int length,
    * This also handles if file has been changed while http call
    * is getting executed. If file has been changed RemoteFileChangedException
    * is thrown.
-   * TODO: add retries here.
-   * @param operationName
-   * @param position
-   * @param length
-   * @return
-   * @throws IOException
+   * @param operationName name of the operation for which get object on S3 is called.
+   * @param position position of the object to be read from S3.
+   * @param length length from position of the object to be read from S3.
+   * @return S3Object
+   * @throws IOException exception if any.
    */
   private S3Object getS3Object(String operationName, long position,
                                int length) throws IOException {
     final GetObjectRequest request = client.newGetRequest(key)
             .withRange(position, position + length - 1);
-    String text = String.format("%s %s at %d length %d",
-            operationName, uri, position, length);
     changeTracker.maybeApplyConstraint(request);
     DurationTracker tracker = streamStatistics.initiateGetRequest();
     S3Object objectRange;
+    Invoker invoker = context.getReadInvoker();
     try {
-      objectRange = Invoker.once(text, uri,
+      objectRange = invoker.retry(operationName, uri, true,
               () -> client.getObject(request));
     } catch (IOException ex) {
       tracker.failed();
       throw ex;
     } finally {
       tracker.close();
-
     }
     changeTracker.processResponse(objectRange, operationName,
             position);

From 38dcd3907384a2bb9300ecc05ea7a8f4f25554c0 Mon Sep 17 00:00:00 2001
From: Mukund Thakur <mthakur@cloudera.com>
Date: Thu, 6 Jan 2022 14:34:53 +0530
Subject: [PATCH 12/13] Fix vectored read for bigger size files

---
 .../apache/hadoop/fs/s3a/S3AInputStream.java  | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
index 9540e3d893c2e..37de3a90eaaeb 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java
@@ -955,11 +955,27 @@ private void populateBuffer(int length,
                               S3ObjectInputStream objectContent) throws IOException {
     if (buffer.isDirect()) {
       byte[] tmp = new byte[length];
-      objectContent.read(tmp, 0, length);
+      readByteArray(objectContent, tmp, 0, length);
       buffer.put(tmp);
       buffer.flip();
     } else {
-      objectContent.read(buffer.array(), buffer.arrayOffset(), length);
+      readByteArray(objectContent, buffer.array(), 0, length);
+    }
+  }
+
+  public void readByteArray(S3ObjectInputStream objectContent,
+                            byte[] dest,
+                            int offset,
+                            int length) throws IOException {
+    int readBytes = 0;
+    while ( readBytes < length) {
+      int readBytesCurr = objectContent.read(dest,
+              offset + readBytes,
+              length - readBytes);
+      readBytes +=readBytesCurr;
+      if (readBytesCurr < 0) {
+        throw new EOFException(FSExceptionMessages.EOF_IN_READ_FULLY);
+      }
     }
   }
 

From dd3914bd98a1de70f7d7e6cf01e545122bc3fa8a Mon Sep 17 00:00:00 2001
From: Mukund Thakur <mthakur@cloudera.com>
Date: Thu, 6 Jan 2022 15:55:20 +0530
Subject: [PATCH 13/13] Test for bigger file size

---
 .../AbstractContractVectoredReadTest.java     | 34 +++++++++++++++----
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
index 2ff0dfa3c4af7..0485381e06c3b 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java
@@ -35,6 +35,9 @@ public abstract class AbstractContractVectoredReadTest extends AbstractFSContrac
   public static final int DATASET_LEN = 1024;
   private static final byte[] DATASET = ContractTestUtils.dataset(DATASET_LEN, 'a', 32);
   private static final String VECTORED_READ_FILE_NAME = "vectored_file.txt";
+  private static final String VECTORED_READ_FILE_1MB_NAME = "vectored_file_1M.txt";
+  private static final byte[] DATASET_MB = ContractTestUtils.dataset(1024 * 1024, 'a', 256);
+
 
   private final IntFunction<ByteBuffer> allocate;
 
@@ -57,6 +60,8 @@ public void setup() throws Exception {
     Path path = path(VECTORED_READ_FILE_NAME);
     FileSystem fs = getFileSystem();
     createFile(fs, path, true, DATASET);
+    Path bigFile = path(VECTORED_READ_FILE_1MB_NAME);
+    createFile(fs, bigFile, true, DATASET_MB);
   }
 
   /**
@@ -112,6 +117,21 @@ public void testVectoredReadAndReadFully()  throws Exception {
     }
   }
 
+
+  @Test
+  public void testVectoredReadBigFile()  throws Exception {
+    FileSystem fs = getFileSystem();
+    List<FileRange> fileRanges = new ArrayList<>();
+    fileRanges.add(new FileRangeImpl(1293, 25837));
+    try (FSDataInputStream in = fs.open(path(VECTORED_READ_FILE_1MB_NAME))) {
+      in.readVectored(fileRanges, allocate);
+      ByteBuffer vecRes = FutureIOSupport.awaitFuture(fileRanges.get(0).getData());
+      FileRange resRange = fileRanges.get(0);
+      assertDatasetEquals((int) resRange.getOffset(), "vecRead",
+              vecRes, resRange.getLength(), DATASET_MB);
+    }
+  }
+
   @Test
   public void testOverlappingRanges() throws Exception {
     FileSystem fs = getFileSystem();
@@ -173,7 +193,7 @@ public void testNormalReadAfterVectoredRead() throws Exception {
       byte[] res = new byte[200];
       in.read(res, 0, 200);
       ByteBuffer buffer = ByteBuffer.wrap(res);
-      assertDatasetEquals(0, "normal_read", buffer, 200);
+      assertDatasetEquals(0, "normal_read", buffer, 200, DATASET);
       Assertions.assertThat(in.getPos())
               .describedAs("Vectored read shouldn't change file pointer.")
               .isEqualTo(200);
@@ -190,7 +210,7 @@ public void testVectoredReadAfterNormalRead() throws Exception {
       byte[] res = new byte[200];
       in.read(res, 0, 200);
       ByteBuffer buffer = ByteBuffer.wrap(res);
-      assertDatasetEquals(0, "normal_read", buffer, 200);
+      assertDatasetEquals(0, "normal_read", buffer, 200, DATASET);
       Assertions.assertThat(in.getPos())
               .describedAs("Vectored read shouldn't change file pointer.")
               .isEqualTo(200);
@@ -223,7 +243,7 @@ protected void validateVectoredReadResult(List<FileRange> fileRanges) {
       CompletableFuture<ByteBuffer> data = res.getData();
       try {
         ByteBuffer buffer = FutureIOSupport.awaitFuture(data);
-        assertDatasetEquals((int) res.getOffset(), "vecRead", buffer, res.getLength());
+        assertDatasetEquals((int) res.getOffset(), "vecRead", buffer, res.getLength(), DATASET);
       } catch (Exception ex) {
         LOG.error("Exception while running vectored read ", ex);
         Assert.fail("Exception while running vectored read " + ex);
@@ -251,21 +271,21 @@ protected void testExceptionalVectoredRead(FileSystem fs,
    * Assert that the data read matches the dataset at the given offset.
    * This helps verify that the seek process is moving the read pointer
    * to the correct location in the file.
-   *
-   * @param readOffset the offset in the file where the read began.
+   *  @param readOffset the offset in the file where the read began.
    * @param operation  operation name for the assertion.
    * @param data       data read in.
    * @param length     length of data to check.
+   * @param originalData
    */
   private void assertDatasetEquals(
           final int readOffset, final String operation,
           final ByteBuffer data,
-          int length) {
+          int length, byte[] originalData) {
     for (int i = 0; i < length; i++) {
       int o = readOffset + i;
       assertEquals(operation + " with read offset " + readOffset
                       + ": data[" + i + "] != DATASET[" + o + "]",
-              DATASET[o], data.get());
+              originalData[o], data.get());
     }
   }
 }