From 3b136218f5f6003d83f30da42ad39be168bc1397 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Thu, 8 Jun 2023 10:40:18 +0100 Subject: [PATCH 1/7] Better paging when random reads go backwards --- .../lucene/store/BufferedIndexInput.java | 8 ++++---- .../lucene/store/TestBufferedIndexInput.java | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java index 8db6ba3031cd..7d1c0476f7a8 100644 --- a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java @@ -43,7 +43,7 @@ public abstract class BufferedIndexInput extends IndexInput implements RandomAcc /** A buffer size for merges set to {@value #MERGE_BUFFER_SIZE}. */ public static final int MERGE_BUFFER_SIZE = 4096; - private int bufferSize = BUFFER_SIZE; + private final int bufferSize; private ByteBuffer buffer = EMPTY_BYTEBUFFER; @@ -163,11 +163,11 @@ public final long readLong() throws IOException { public final byte readByte(long pos) throws IOException { long index = pos - bufferStart; if (index < 0 || index >= buffer.limit()) { - bufferStart = pos; + bufferStart = index < 0 ? Math.min(pos, Math.max(0, bufferStart - bufferSize)) : pos; buffer.limit(0); // trigger refill() on read - seekInternal(pos); + seekInternal(bufferStart); refill(); - index = 0; + index = pos - bufferStart; } return buffer.get((int) index); } diff --git a/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java b/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java index 546a22493975..b17742427082 100644 --- a/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java +++ b/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java @@ -142,6 +142,16 @@ public void testEOF() throws Exception { }); } + // Test that when reading backwards, we page backwards rather than refilling + // on every call + public void testBackwardsReads() throws IOException { + MyBufferedIndexInput input = new MyBufferedIndexInput(1024 * 8); + for (int i = 2048; i > 0; i -= random().nextInt(16)) { + assertEquals(byten(i), input.readByte(i)); + } + assertEquals(3, input.readCount); + } + // byten emulates a file - byten(n) returns the n'th byte in that file. // MyBufferedIndexInput reads this "file". private static byte byten(long n) { @@ -150,7 +160,8 @@ private static byte byten(long n) { private static class MyBufferedIndexInput extends BufferedIndexInput { private long pos; - private long len; + private final long len; + private long readCount = 0; public MyBufferedIndexInput(long len) { super("MyBufferedIndexInput(len=" + len + ")", BufferedIndexInput.BUFFER_SIZE); @@ -164,14 +175,15 @@ public MyBufferedIndexInput() { } @Override - protected void readInternal(ByteBuffer b) throws IOException { + protected void readInternal(ByteBuffer b) { + readCount++; while (b.hasRemaining()) { b.put(byten(pos++)); } } @Override - protected void seekInternal(long pos) throws IOException { + protected void seekInternal(long pos) { this.pos = pos; } From 7e8f54a283c0ecd1af06c4000981527e37792858 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Thu, 8 Jun 2023 13:35:38 +0100 Subject: [PATCH 2/7] Extend to int, short, long --- .../lucene/store/BufferedIndexInput.java | 41 ++++++++++---- .../lucene/store/TestBufferedIndexInput.java | 56 ++++++++++++++++++- 2 files changed, 86 insertions(+), 11 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java index 7d1c0476f7a8..a53172e60473 100644 --- a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java @@ -72,7 +72,7 @@ public BufferedIndexInput(String resourceDesc, int bufferSize) { this.bufferSize = bufferSize; } - /** Returns buffer size. @see #setBufferSize */ + /** Returns buffer size */ public final int getBufferSize() { return bufferSize; } @@ -163,6 +163,9 @@ public final long readLong() throws IOException { public final byte readByte(long pos) throws IOException { long index = pos - bufferStart; if (index < 0 || index >= buffer.limit()) { + // if we're moving backwards, then try and fill up the previous page rather than + // starting again at the current pos, to avoid successive backwards reads reloading + // the same data over and over again bufferStart = index < 0 ? Math.min(pos, Math.max(0, bufferStart - bufferSize)) : pos; buffer.limit(0); // trigger refill() on read seekInternal(bufferStart); @@ -176,11 +179,17 @@ public final byte readByte(long pos) throws IOException { public final short readShort(long pos) throws IOException { long index = pos - bufferStart; if (index < 0 || index >= buffer.limit() - 1) { - bufferStart = pos; + // if we're moving backwards, then try and fill up the previous page rather than + // starting again at the current pos, to avoid successive backwards reads reloading + // the same data over and over again + bufferStart = index < 0 ? Math.min(pos, Math.max(0, bufferStart - bufferSize)) : pos; + if (bufferStart + bufferSize - pos <= 1) { + bufferStart++; // adjust buffer to include the whole short value at pos + } buffer.limit(0); // trigger refill() on read - seekInternal(pos); + seekInternal(bufferStart); refill(); - index = 0; + index = pos - bufferStart; } return buffer.getShort((int) index); } @@ -189,11 +198,17 @@ public final short readShort(long pos) throws IOException { public final int readInt(long pos) throws IOException { long index = pos - bufferStart; if (index < 0 || index >= buffer.limit() - 3) { - bufferStart = pos; + // if we're moving backwards, then try and fill up the previous page rather than + // starting again at the current pos, to avoid successive backwards reads reloading + // the same data over and over again + bufferStart = index < 0 ? Math.min(pos, Math.max(0, bufferStart - bufferSize)) : pos; + if (bufferStart + bufferSize - pos <= 3) { + bufferStart += 4; // adjust buffer to include the whole long value at pos + } buffer.limit(0); // trigger refill() on read - seekInternal(pos); + seekInternal(bufferStart); refill(); - index = 0; + index = pos - bufferStart; } return buffer.getInt((int) index); } @@ -202,11 +217,17 @@ public final int readInt(long pos) throws IOException { public final long readLong(long pos) throws IOException { long index = pos - bufferStart; if (index < 0 || index >= buffer.limit() - 7) { - bufferStart = pos; + // if we're moving backwards, then try and fill up the previous page rather than + // starting again at the current pos, to avoid successive backwards reads reloading + // the same data over and over again + bufferStart = index < 0 ? Math.min(pos, Math.max(0, bufferStart - bufferSize)) : pos; + if (bufferStart + bufferSize - pos <= 7) { + bufferStart += 8; // adjust buffer to include the whole long value at pos + } buffer.limit(0); // trigger refill() on read - seekInternal(pos); + seekInternal(bufferStart); refill(); - index = 0; + index = pos - bufferStart; } return buffer.getLong((int) index); } diff --git a/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java b/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java index b17742427082..105f2544aa0d 100644 --- a/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java +++ b/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.Random; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.ArrayUtil; @@ -144,7 +145,7 @@ public void testEOF() throws Exception { // Test that when reading backwards, we page backwards rather than refilling // on every call - public void testBackwardsReads() throws IOException { + public void testBackwardsByteReads() throws IOException { MyBufferedIndexInput input = new MyBufferedIndexInput(1024 * 8); for (int i = 2048; i > 0; i -= random().nextInt(16)) { assertEquals(byten(i), input.readByte(i)); @@ -152,6 +153,59 @@ public void testBackwardsReads() throws IOException { assertEquals(3, input.readCount); } + public void testBackwardsShortReads() throws IOException { + MyBufferedIndexInput input = new MyBufferedIndexInput(1024 * 8); + ByteBuffer bb = ByteBuffer.allocate(2); + bb.order(ByteOrder.LITTLE_ENDIAN); + for (int i = 2048; i > 0; i -= (random().nextInt(16) + 1)) { + bb.clear(); + bb.put(byten(i)); + bb.put(byten(i + 1)); + assertEquals(bb.getShort(0), input.readShort(i)); + } + // readCount can be three or four, depending on whether or not we had to adjust the bufferStart + // to include a whole short + assertTrue(input.readCount == 4 || input.readCount == 3); + } + + public void testBackwardsIntReads() throws IOException { + MyBufferedIndexInput input = new MyBufferedIndexInput(1024 * 8); + ByteBuffer bb = ByteBuffer.allocate(4); + bb.order(ByteOrder.LITTLE_ENDIAN); + for (int i = 2048; i > 0; i -= (random().nextInt(16) + 3)) { + bb.clear(); + bb.put(byten(i)); + bb.put(byten(i + 1)); + bb.put(byten(i + 2)); + bb.put(byten(i + 3)); + assertEquals(bb.getInt(0), input.readInt(i)); + } + // readCount can be three or four, depending on whether or not we had to adjust the bufferStart + // to include a whole int + assertTrue(input.readCount == 4 || input.readCount == 3); + } + + public void testBackwardsLongReads() throws IOException { + MyBufferedIndexInput input = new MyBufferedIndexInput(1024 * 8); + ByteBuffer bb = ByteBuffer.allocate(8); + bb.order(ByteOrder.LITTLE_ENDIAN); + for (int i = 2048; i > 0; i -= (random().nextInt(16) + 7)) { + bb.clear(); + bb.put(byten(i)); + bb.put(byten(i + 1)); + bb.put(byten(i + 2)); + bb.put(byten(i + 3)); + bb.put(byten(i + 4)); + bb.put(byten(i + 5)); + bb.put(byten(i + 6)); + bb.put(byten(i + 7)); + assertEquals(bb.getLong(0), input.readLong(i)); + } + // readCount can be three or four, depending on whether or not we had to adjust the bufferStart + // to include a whole long + assertTrue(input.readCount == 4 || input.readCount == 3); + } + // byten emulates a file - byten(n) returns the n'th byte in that file. // MyBufferedIndexInput reads this "file". private static byte byten(long n) { From 1365b688f0ec1cbe6a17a2542ac7643a55a38cf9 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Thu, 8 Jun 2023 14:45:41 +0100 Subject: [PATCH 3/7] fix comment --- .../src/java/org/apache/lucene/store/BufferedIndexInput.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java index a53172e60473..41bab02229c1 100644 --- a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java @@ -203,7 +203,7 @@ public final int readInt(long pos) throws IOException { // the same data over and over again bufferStart = index < 0 ? Math.min(pos, Math.max(0, bufferStart - bufferSize)) : pos; if (bufferStart + bufferSize - pos <= 3) { - bufferStart += 4; // adjust buffer to include the whole long value at pos + bufferStart += 4; // adjust buffer to include the whole int value at pos } buffer.limit(0); // trigger refill() on read seekInternal(bufferStart); From ee40aed5cc5d55947b634e85d5bdbec886eac132 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Fri, 9 Jun 2023 10:03:43 +0100 Subject: [PATCH 4/7] deef --- .../lucene/store/BufferedIndexInput.java | 74 ++++++------------- .../lucene/store/TestBufferedIndexInput.java | 9 ++- 2 files changed, 30 insertions(+), 53 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java index 41bab02229c1..51dd71d49e0c 100644 --- a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java @@ -159,76 +159,50 @@ public final long readLong() throws IOException { } } - @Override - public final byte readByte(long pos) throws IOException { + private long maybeRefillBuffer(long pos, int width) throws IOException { long index = pos - bufferStart; - if (index < 0 || index >= buffer.limit()) { + if (index >= 0 && index < buffer.limit() - width + 1) { + return index; + } + if (index < 0) { // if we're moving backwards, then try and fill up the previous page rather than // starting again at the current pos, to avoid successive backwards reads reloading // the same data over and over again - bufferStart = index < 0 ? Math.min(pos, Math.max(0, bufferStart - bufferSize)) : pos; - buffer.limit(0); // trigger refill() on read - seekInternal(bufferStart); - refill(); - index = pos - bufferStart; + bufferStart = Math.min(pos, Math.max(0, bufferStart - bufferSize)); + if (bufferStart + bufferSize - pos < width) { + bufferStart += width - 1; // make sure we don't read over the end of the buffer + } + } else { + // we're moving forwards, reset the buffer to start at pos + bufferStart = pos; } + buffer.limit(0); // trigger refill() on read + seekInternal(bufferStart); + refill(); + return pos - bufferStart; + } + + @Override + public final byte readByte(long pos) throws IOException { + long index = maybeRefillBuffer(pos, Byte.BYTES); return buffer.get((int) index); } @Override public final short readShort(long pos) throws IOException { - long index = pos - bufferStart; - if (index < 0 || index >= buffer.limit() - 1) { - // if we're moving backwards, then try and fill up the previous page rather than - // starting again at the current pos, to avoid successive backwards reads reloading - // the same data over and over again - bufferStart = index < 0 ? Math.min(pos, Math.max(0, bufferStart - bufferSize)) : pos; - if (bufferStart + bufferSize - pos <= 1) { - bufferStart++; // adjust buffer to include the whole short value at pos - } - buffer.limit(0); // trigger refill() on read - seekInternal(bufferStart); - refill(); - index = pos - bufferStart; - } + long index = maybeRefillBuffer(pos, Short.BYTES); return buffer.getShort((int) index); } @Override public final int readInt(long pos) throws IOException { - long index = pos - bufferStart; - if (index < 0 || index >= buffer.limit() - 3) { - // if we're moving backwards, then try and fill up the previous page rather than - // starting again at the current pos, to avoid successive backwards reads reloading - // the same data over and over again - bufferStart = index < 0 ? Math.min(pos, Math.max(0, bufferStart - bufferSize)) : pos; - if (bufferStart + bufferSize - pos <= 3) { - bufferStart += 4; // adjust buffer to include the whole int value at pos - } - buffer.limit(0); // trigger refill() on read - seekInternal(bufferStart); - refill(); - index = pos - bufferStart; - } + long index = maybeRefillBuffer(pos, Integer.BYTES); return buffer.getInt((int) index); } @Override public final long readLong(long pos) throws IOException { - long index = pos - bufferStart; - if (index < 0 || index >= buffer.limit() - 7) { - // if we're moving backwards, then try and fill up the previous page rather than - // starting again at the current pos, to avoid successive backwards reads reloading - // the same data over and over again - bufferStart = index < 0 ? Math.min(pos, Math.max(0, bufferStart - bufferSize)) : pos; - if (bufferStart + bufferSize - pos <= 7) { - bufferStart += 8; // adjust buffer to include the whole long value at pos - } - buffer.limit(0); // trigger refill() on read - seekInternal(bufferStart); - refill(); - index = pos - bufferStart; - } + long index = maybeRefillBuffer(pos, Long.BYTES); return buffer.getLong((int) index); } diff --git a/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java b/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java index 105f2544aa0d..d19b2058e07b 100644 --- a/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java +++ b/lucene/core/src/test/org/apache/lucene/store/TestBufferedIndexInput.java @@ -165,7 +165,8 @@ public void testBackwardsShortReads() throws IOException { } // readCount can be three or four, depending on whether or not we had to adjust the bufferStart // to include a whole short - assertTrue(input.readCount == 4 || input.readCount == 3); + assertTrue( + "Expected 4 or 3, got " + input.readCount, input.readCount == 4 || input.readCount == 3); } public void testBackwardsIntReads() throws IOException { @@ -182,7 +183,8 @@ public void testBackwardsIntReads() throws IOException { } // readCount can be three or four, depending on whether or not we had to adjust the bufferStart // to include a whole int - assertTrue(input.readCount == 4 || input.readCount == 3); + assertTrue( + "Expected 4 or 3, got " + input.readCount, input.readCount == 4 || input.readCount == 3); } public void testBackwardsLongReads() throws IOException { @@ -203,7 +205,8 @@ public void testBackwardsLongReads() throws IOException { } // readCount can be three or four, depending on whether or not we had to adjust the bufferStart // to include a whole long - assertTrue(input.readCount == 4 || input.readCount == 3); + assertTrue( + "Expected 4 or 3, got " + input.readCount, input.readCount == 4 || input.readCount == 3); } // byten emulates a file - byten(n) returns the n'th byte in that file. From be2341d39d76d57904ab36e32823eacf843d4b2d Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Fri, 9 Jun 2023 10:31:21 +0100 Subject: [PATCH 5/7] changes --- lucene/CHANGES.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 50229e12dae2..996ae09216be 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -177,6 +177,9 @@ Optimizations * GITHUB#12328: Optimize ConjunctionDISI.createConjunction (Armin Braun) +* GITHUB#12357: Better paging when doing backwards random reads. This speeds up + queries relying on terms in NIOFSDirectory. (Alan Woodward) + Bug Fixes --------------------- From 8d196a08ba4c49a4ea78d7680271d9d75dde10b3 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Fri, 9 Jun 2023 10:42:56 +0100 Subject: [PATCH 6/7] rename --- .../org/apache/lucene/store/BufferedIndexInput.java | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java index 51dd71d49e0c..54680980bcfc 100644 --- a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java @@ -159,7 +159,10 @@ public final long readLong() throws IOException { } } - private long maybeRefillBuffer(long pos, int width) throws IOException { + // Computes an offset into the current buffer from an absolute position to read + // `width` bytes from. If the buffer does not contain the position, then we + // readjust the bufferStart and refill. + private long resolvePositionInBuffer(long pos, int width) throws IOException { long index = pos - bufferStart; if (index >= 0 && index < buffer.limit() - width + 1) { return index; @@ -184,25 +187,25 @@ private long maybeRefillBuffer(long pos, int width) throws IOException { @Override public final byte readByte(long pos) throws IOException { - long index = maybeRefillBuffer(pos, Byte.BYTES); + long index = resolvePositionInBuffer(pos, Byte.BYTES); return buffer.get((int) index); } @Override public final short readShort(long pos) throws IOException { - long index = maybeRefillBuffer(pos, Short.BYTES); + long index = resolvePositionInBuffer(pos, Short.BYTES); return buffer.getShort((int) index); } @Override public final int readInt(long pos) throws IOException { - long index = maybeRefillBuffer(pos, Integer.BYTES); + long index = resolvePositionInBuffer(pos, Integer.BYTES); return buffer.getInt((int) index); } @Override public final long readLong(long pos) throws IOException { - long index = maybeRefillBuffer(pos, Long.BYTES); + long index = resolvePositionInBuffer(pos, Long.BYTES); return buffer.getLong((int) index); } From a6942feae08646803e1d1cfb6701f911c774903f Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Fri, 9 Jun 2023 11:33:33 +0100 Subject: [PATCH 7/7] deef --- lucene/CHANGES.txt | 2 +- .../org/apache/lucene/store/BufferedIndexInput.java | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6f29093e0349..332e8eb4f608 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -178,7 +178,7 @@ Optimizations * GITHUB#12328: Optimize ConjunctionDISI.createConjunction (Armin Braun) * GITHUB#12357: Better paging when doing backwards random reads. This speeds up - queries relying on terms in NIOFSDirectory. (Alan Woodward) + queries relying on terms in NIOFSDirectory and SimpleFSDirectory. (Alan Woodward) * GITHUB#12339: Optimize part of duplicate calculation numDeletesToMerge in merge phase (fudongying) diff --git a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java index 54680980bcfc..50dde6aa4c92 100644 --- a/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/store/BufferedIndexInput.java @@ -164,17 +164,17 @@ public final long readLong() throws IOException { // readjust the bufferStart and refill. private long resolvePositionInBuffer(long pos, int width) throws IOException { long index = pos - bufferStart; - if (index >= 0 && index < buffer.limit() - width + 1) { + if (index >= 0 && index <= buffer.limit() - width) { return index; } if (index < 0) { // if we're moving backwards, then try and fill up the previous page rather than // starting again at the current pos, to avoid successive backwards reads reloading - // the same data over and over again - bufferStart = Math.min(pos, Math.max(0, bufferStart - bufferSize)); - if (bufferStart + bufferSize - pos < width) { - bufferStart += width - 1; // make sure we don't read over the end of the buffer - } + // the same data over and over again. We also check that we can read `width` + // bytes without going over the end of the buffer + bufferStart = Math.max(bufferStart - bufferSize, pos + width - bufferSize); + bufferStart = Math.max(bufferStart, 0); + bufferStart = Math.min(bufferStart, pos); } else { // we're moving forwards, reset the buffer to start at pos bufferStart = pos;