From 03658576876eef0e8e0bbab2aa2e6c607df8c205 Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Sat, 20 May 2023 09:34:51 -0700 Subject: [PATCH 1/6] Add "direct to binary" option for DaciukMihovAutomatonBuilder and use in TermInSetQuery#visit --- lucene/CHANGES.txt | 3 + .../apache/lucene/search/TermInSetQuery.java | 25 +-- .../org/apache/lucene/util/UnicodeUtil.java | 75 ++++--- .../util/automaton/StringsToAutomaton.java | 207 ++++++++++-------- .../apache/lucene/util/TestUnicodeUtil.java | 23 ++ .../automaton/TestStringsToAutomaton.java | 120 +++++++++- 6 files changed, 320 insertions(+), 133 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a81efd4c9447..53272260feac 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -158,6 +158,9 @@ Improvements * GITHUB#12290: Make memory fence in ByteBufferGuard explicit using `VarHandle.fullFence()` +* GITHUB#12320: Add "direct to binary" option for DaciukMihovAutomatonBuilder and use it in TermInSetQuery#visit. + (Greg Miller) + Optimizations --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java index b989ac07849f..7cce67702771 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java @@ -17,11 +17,10 @@ package org.apache.lucene.search; import java.io.IOException; -import java.util.ArrayList; +import java.io.UncheckedIOException; import java.util.Arrays; import java.util.Collection; import java.util.Collections; -import java.util.List; import java.util.SortedSet; import org.apache.lucene.index.FilteredTermsEnum; import org.apache.lucene.index.PrefixCodedTerms; @@ -35,11 +34,9 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; -import org.apache.lucene.util.automaton.CompiledAutomaton; -import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.StringsToAutomaton; /** * Specialization for a disjunction over many terms that, by default, behaves like a {@link @@ -150,17 +147,17 @@ public void visit(QueryVisitor visitor) { } } - // TODO: this is extremely slow. we should not be doing this. + // TODO: This is pretty heavy-weight. If we have TermInSetQuery directly extend AutomatonQuery + // we won't have to do this (see GH#12176). private ByteRunAutomaton asByteRunAutomaton() { - TermIterator iterator = termData.iterator(); - List automata = new ArrayList<>(); - for (BytesRef term = iterator.next(); term != null; term = iterator.next()) { - automata.add(Automata.makeBinary(term)); + try { + Automaton a = StringsToAutomaton.build(termData.iterator(), true); + return new ByteRunAutomaton(a, true); + } catch (IOException e) { + // Shouldn't happen since termData.iterator() provides an interator implementation that + // never throws: + throw new UncheckedIOException(e); } - Automaton automaton = - Operations.determinize( - Operations.union(automata), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); - return new CompiledAutomaton(automaton).runAutomaton; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java index a9e9d815eb49..f4716b4c64e4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java @@ -477,38 +477,59 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) { int utf8Upto = utf8.offset; final byte[] bytes = utf8.bytes; final int utf8Limit = utf8.offset + utf8.length; + UTF8CodePointState state = new UTF8CodePointState(); while (utf8Upto < utf8Limit) { - final int numBytes = utf8CodeLength[bytes[utf8Upto] & 0xFF]; - int v = 0; - switch (numBytes) { - case 1: - ints[utf32Count++] = bytes[utf8Upto++]; - continue; - case 2: - // 5 useful bits - v = bytes[utf8Upto++] & 31; - break; - case 3: - // 4 useful bits - v = bytes[utf8Upto++] & 15; - break; - case 4: - // 3 useful bits - v = bytes[utf8Upto++] & 7; - break; - default: - throw new IllegalArgumentException("invalid utf8"); - } + UTF8CodePointAt(bytes, utf8Upto, state); + ints[utf32Count++] = state.codePoint; + utf8Upto += state.codePointBytes; + } + + return utf32Count; + } + + /** + * Computes the codepoint and codepoint length (in bytes) of the specified {@code offset} in the + * provided {@code utf8} {@link BytesRef}, assuming UTF8 encoding. Note that {@code offset} is + * always zero-based, not relative to {@link BytesRef#offset}. As with other related methods in + * this class, this assumes valid UTF8 input and does not perform full UTF8 + * validation. + * + * @throws IllegalArgumentException If invalid codepoint header byte occurs or the content is + * prematurely truncated. + */ + public static void UTF8CodePointAt(BytesRef utf8, int offset, UTF8CodePointState state) { + UTF8CodePointAt(utf8.bytes, utf8.offset + offset, state); + } - // TODO: this may read past utf8's limit. - final int limit = utf8Upto + numBytes - 1; - while (utf8Upto < limit) { - v = v << 6 | bytes[utf8Upto++] & 63; + private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState state) { + int leadByte = utf8[pos] & 0xFF; + int numBytes = utf8CodeLength[leadByte]; + state.codePointBytes = numBytes; + int v; + switch (numBytes) { + case 1 -> { + state.codePoint = leadByte; + return; } - ints[utf32Count++] = v; + case 2 -> v = leadByte & 31; // 5 useful bits + case 3 -> v = leadByte & 15; // 4 useful bits + case 4 -> v = leadByte & 7; // 3 useful bits + default -> throw new IllegalArgumentException("invalid utf8"); } - return utf32Count; + // TODO: this may read past utf8's limit. + final int limit = pos + numBytes; + pos++; + while (pos < limit) { + v = v << 6 | utf8[pos++] & 63; + } + state.codePoint = v; + } + + /** Holds a Unicode codepoint along with the number of bytes required to represent it in UTF8 */ + public static final class UTF8CodePointState { + public int codePoint; + public int codePointBytes; } /** Shift value for lead surrogate to form a supplementary character. */ diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java index 08826dcbbe3c..a9abe6a6c2db 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java @@ -16,22 +16,27 @@ */ package org.apache.lucene.util.automaton; +import java.io.IOException; import java.util.Arrays; import java.util.Collection; -import java.util.Comparator; import java.util.HashMap; import java.util.IdentityHashMap; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.CharsRef; -import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.BytesRefIterator; +import org.apache.lucene.util.UnicodeUtil; /** * Builds a minimal, deterministic {@link Automaton} that accepts a set of strings using the * algorithm described in Incremental Construction * of Minimal Acyclic Finite-State Automata by Daciuk, Mihov, Watson and Watson. This requires - * sorted input data, but is very fast (nearly linear with the input size). + * sorted input data, but is very fast (nearly linear with the input size). Also offers the ability + * to directly build a binary {@link Automaton} representation. * + * @see #build(Collection) + * @see #build(Collection, boolean) + * @see #build(BytesRefIterator, boolean) * @see Automata#makeStringUnion(Collection) */ final class StringsToAutomaton { @@ -176,64 +181,16 @@ private static boolean referenceEquals(Object[] a1, Object[] a2) { /** Root automaton state. */ private final State root = new State(); - /** Previous sequence added to the automaton in {@link #add(CharsRef)}. */ - private CharsRefBuilder previous; + /** Used for input order checking (only through assertions right now) */ + private BytesRefBuilder previous; - /** A comparator used for enforcing sorted UTF8 order, used in assertions only. */ - @SuppressWarnings("deprecation") - private static final Comparator comparator = CharsRef.getUTF16SortedAsUTF8Comparator(); - - /** - * Add another character sequence to this automaton. The sequence must be lexicographically larger - * or equal compared to any previous sequences added to this automaton (the input must be sorted). - */ - private void add(CharsRef current) { - if (current.length > Automata.MAX_STRING_UNION_TERM_LENGTH) { - throw new IllegalArgumentException( - "This builder doesn't allow terms that are larger than 1,000 characters, got " + current); - } - assert stateRegistry != null : "Automaton already built."; - assert previous == null || comparator.compare(previous.get(), current) <= 0 - : "Input must be in sorted UTF-8 order: " + previous + " >= " + current; - assert setPrevious(current); - - // Descend in the automaton (find matching prefix). - int pos = 0, max = current.length(); - State state = root; - for (; ; ) { - assert pos <= max; - if (pos == max) { - break; - } - - int codePoint = Character.codePointAt(current, pos); - State next = state.lastChild(codePoint); - if (next == null) { - break; - } - - state = next; - pos += Character.charCount(codePoint); + /** Copy current into an internal buffer. */ + private boolean setPrevious(BytesRef current) { + if (previous == null) { + previous = new BytesRefBuilder(); } - - if (state.hasChildren()) replaceOrRegister(state); - - addSuffix(state, current, pos); - } - - /** - * Finalize the automaton and return the root state. No more strings can be added to the builder - * after this call. - * - * @return Root automaton state. - */ - private State complete() { - if (this.stateRegistry == null) throw new IllegalStateException(); - - if (root.hasChildren()) replaceOrRegister(root); - - stateRegistry = null; - return root; + previous.copyBytes(current); + return true; } /** Internal recursive traversal for conversion. */ @@ -258,39 +215,127 @@ private static int convert( return converted; } + /** + * Called after adding all terms. Performs final minimization and converts to a standard {@link + * Automaton} instance. + */ + private Automaton completeAndConvert() { + // Final minimization: + if (this.stateRegistry == null) throw new IllegalStateException(); + if (root.hasChildren()) replaceOrRegister(root); + stateRegistry = null; + + // Convert: + Automaton.Builder a = new Automaton.Builder(); + convert(a, root, new IdentityHashMap<>()); + return a.finish(); + } + /** * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing - * strings in UTF-8. These strings must be binary-sorted. + * strings in UTF-8. These strings must be binary-sorted. Creates an {@link Automaton} with UTF-8 + * codepoints as transition labels. */ static Automaton build(Collection input) { + return build(input, false); + } + + /** + * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing + * strings in UTF-8. These strings must be binary-sorted. Creates an {@link Automaton} with either + * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code + * asBinary}. + */ + static Automaton build(Collection input, boolean asBinary) { final StringsToAutomaton builder = new StringsToAutomaton(); - CharsRefBuilder current = new CharsRefBuilder(); for (BytesRef b : input) { - current.copyUTF8Bytes(b); - builder.add(current.get()); + builder.add(b, asBinary); } - Automaton.Builder a = new Automaton.Builder(); - convert(a, builder.complete(), new IdentityHashMap<>()); + return builder.completeAndConvert(); + } - return a.finish(); + /** + * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing + * strings in UTF-8. These strings must be binary-sorted. Creates an {@link Automaton} with either + * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code + * asBinary}. + */ + static Automaton build(BytesRefIterator input, boolean asBinary) throws IOException { + final StringsToAutomaton builder = new StringsToAutomaton(); + + for (BytesRef b = input.next(); b != null; b = input.next()) { + builder.add(b, asBinary); + } + + return builder.completeAndConvert(); } - /** Copy current into an internal buffer. */ - private boolean setPrevious(CharsRef current) { - if (previous == null) { - previous = new CharsRefBuilder(); + private void add(BytesRef current, boolean asBinary) { + if (current.length > Automata.MAX_STRING_UNION_TERM_LENGTH) { + throw new IllegalArgumentException( + "This builder doesn't allow terms that are larger than " + + Automata.MAX_STRING_UNION_TERM_LENGTH + + " characters, got " + + current); } - previous.copyChars(current); - return true; + assert stateRegistry != null : "Automaton already built."; + assert previous == null || previous.get().compareTo(current) <= 0 + : "Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current; + assert setPrevious(current); + + // Reusable state information if we're building a non-binary based automaton + UnicodeUtil.UTF8CodePointState scratchState = null; + if (asBinary == false) { + scratchState = new UnicodeUtil.UTF8CodePointState(); + } + + // Descend in the automaton (find matching prefix). + int pos = 0, max = current.length; + State next, state = root; + if (asBinary) { + while (pos < max + && (next = state.lastChild(current.bytes[current.offset + pos] & 0xff)) != null) { + state = next; + pos++; + } + } else { + while (pos < max) { + UnicodeUtil.UTF8CodePointAt(current, pos, scratchState); + next = state.lastChild(scratchState.codePoint); + if (next == null) { + break; + } + state = next; + pos += scratchState.codePointBytes; + } + } + + if (state.hasChildren()) replaceOrRegister(state); + + // Add suffix + if (asBinary) { + while (pos < max) { + state = state.newState(current.bytes[current.offset + pos] & 0xff); + pos++; + } + } else { + while (pos < max) { + assert scratchState != null; + UnicodeUtil.UTF8CodePointAt(current, pos, scratchState); + state = state.newState(scratchState.codePoint); + pos += scratchState.codePointBytes; + } + } + state.is_final = true; } /** * Replace last child of state with an already registered state or stateRegistry the * last child state. */ - private void replaceOrRegister(State state) { + protected void replaceOrRegister(State state) { final State child = state.lastChild(); if (child.hasChildren()) replaceOrRegister(child); @@ -302,18 +347,4 @@ private void replaceOrRegister(State state) { stateRegistry.put(child, child); } } - - /** - * Add a suffix of current starting at fromIndex (inclusive) to state - * state. - */ - private void addSuffix(State state, CharSequence current, int fromIndex) { - final int len = current.length(); - while (fromIndex < len) { - int cp = Character.codePointAt(current, fromIndex); - state = state.newState(cp); - fromIndex += Character.charCount(cp); - } - state.is_final = true; - } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java index 8e0348b706c6..9a80f1ae2eca 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java @@ -166,6 +166,29 @@ public void testUTF8toUTF32() { } } + public void testUTF8CodePointAt() { + int num = atLeast(50000); + UnicodeUtil.UTF8CodePointState state = new UnicodeUtil.UTF8CodePointState(); + for (int i = 0; i < num; i++) { + final String s = TestUtil.randomUnicodeString(random()); + final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())]; + final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8); + final BytesRef utf8Ref = newBytesRef(utf8, 0, utf8Len); + + int[] expected = s.codePoints().toArray(); + int pos = 0; + int expectedUpto = 0; + while (pos < utf8Len) { + UnicodeUtil.UTF8CodePointAt(utf8Ref, pos, state); + assertEquals(expected[expectedUpto], state.codePoint); + expectedUpto++; + pos += state.codePointBytes; + } + assertEquals(utf8Len, pos); + assertEquals(expected.length, expectedUpto); + } + } + public void testNewString() { final int[] codePoints = { Character.toCodePoint(Character.MIN_HIGH_SURROGATE, Character.MAX_LOW_SURROGATE), diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java index a251ae748809..54a6d861775a 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java @@ -16,26 +16,138 @@ */ package org.apache.lucene.util.automaton; +import com.carrotsearch.randomizedtesting.RandomizedTest; +import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.BytesRefIterator; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.fst.Util; public class TestStringsToAutomaton extends LuceneTestCase { - public void testLargeTerms() { + public void testBasic() throws Exception { + List terms = basicTerms(); + Collections.sort(terms); + + Automaton a = build(terms, false); + checkAutomaton(terms, a, false); + } + + public void testBasicBinary() throws Exception { + List terms = basicTerms(); + Collections.sort(terms); + + Automaton a = build(terms, true); + checkAutomaton(terms, a, true); + } + + public void testRandomUnicodeOnly() throws Exception { + testRandom(false); + } + + public void testRandomBinary() throws Exception { + testRandom(true); + } + + public void testLargeTerms() throws Exception { byte[] b10k = new byte[10_000]; Arrays.fill(b10k, (byte) 'a'); IllegalArgumentException e = expectThrows( IllegalArgumentException.class, - () -> StringsToAutomaton.build(Collections.singleton(new BytesRef(b10k)))); + () -> build(Collections.singleton(new BytesRef(b10k)), false)); assertTrue( e.getMessage() - .startsWith("This builder doesn't allow terms that are larger than 1,000 characters")); + .startsWith( + "This builder doesn't allow terms that are larger than " + + StringsToAutomaton.MAX_TERM_LENGTH + + " characters")); byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000); - StringsToAutomaton.build(Collections.singleton(new BytesRef(b1k))); // no exception + build(Collections.singleton(new BytesRef(b1k)), false); // no exception + } + + private void testRandom(boolean allowBinary) throws Exception { + int iters = RandomizedTest.isNightly() ? 50 : 10; + for (int i = 0; i < iters; i++) { + int size = random().nextInt(500, 2_000); + Set terms = new HashSet<>(size); + for (int j = 0; j < size; j++) { + if (allowBinary && random().nextInt(10) < 2) { + // Sometimes random bytes term that isn't necessarily valid unicode + terms.add(newBytesRef(TestUtil.randomBinaryTerm(random()))); + } else { + terms.add(newBytesRef(TestUtil.randomRealisticUnicodeString(random()))); + } + } + + List sorted = terms.stream().sorted().toList(); + Automaton a = build(sorted, allowBinary); + checkAutomaton(sorted, a, allowBinary); + } + } + + private void checkAutomaton(List expected, Automaton a, boolean isBinary) { + CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary); + ByteRunAutomaton runAutomaton = c.runAutomaton; + + for (BytesRef t : expected) { + String readable = isBinary ? t.toString() : t.utf8ToString(); + assertTrue( + readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length)); + } + + BytesRefBuilder scratch = new BytesRefBuilder(); + FiniteStringsIterator it = new FiniteStringsIterator(c.automaton); + for (IntsRef r = it.next(); r != null; r = it.next()) { + BytesRef t = Util.toBytesRef(r, scratch); + assertTrue(expected.contains(t)); + } + } + + private List basicTerms() { + List terms = new ArrayList<>(); + terms.add(newBytesRef("dog")); + terms.add(newBytesRef("day")); + terms.add(newBytesRef("dad")); + terms.add(newBytesRef("cats")); + terms.add(newBytesRef("cat")); + return terms; + } + + private Automaton build(Collection terms, boolean asBinary) throws IOException { + if (random().nextBoolean()) { + return StringsToAutomaton.build(terms, asBinary); + } else { + return StringsToAutomaton.build(new TermIterator(terms), asBinary); + } + } + + private static class TermIterator implements BytesRefIterator { + private final Iterator it; + + TermIterator(Collection terms) { + this.it = terms.iterator(); + } + + @Override + public BytesRef next() throws IOException { + if (it.hasNext() == false) { + return null; + } + return it.next(); + } } } From 681f384beb9431fdf0baa709af07c1dc33479be0 Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Thu, 25 May 2023 17:35:32 -0700 Subject: [PATCH 2/6] plumb through Automata --- .../apache/lucene/search/TermInSetQuery.java | 4 +- .../lucene/util/automaton/Automata.java | 46 ++++++++++++++++++- .../util/automaton/StringsToAutomaton.java | 5 +- 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java index 7cce67702771..542f94e65f8e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java @@ -34,9 +34,9 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; -import org.apache.lucene.util.automaton.StringsToAutomaton; /** * Specialization for a disjunction over many terms that, by default, behaves like a {@link @@ -151,7 +151,7 @@ public void visit(QueryVisitor visitor) { // we won't have to do this (see GH#12176). private ByteRunAutomaton asByteRunAutomaton() { try { - Automaton a = StringsToAutomaton.build(termData.iterator(), true); + Automaton a = Automata.makeBinaryStringUnion(termData.iterator()); return new ByteRunAutomaton(a, true); } catch (IOException e) { // Shouldn't happen since termData.iterator() provides an interator implementation that diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java index 20233374ce93..89522781fdbd 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java @@ -29,9 +29,11 @@ package org.apache.lucene.util.automaton; +import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.StringHelper; /** @@ -578,7 +580,49 @@ public static Automaton makeStringUnion(Collection utf8Strings) { if (utf8Strings.isEmpty()) { return makeEmpty(); } else { - return StringsToAutomaton.build(utf8Strings); + return StringsToAutomaton.build(utf8Strings, false); } } + + /** + * Returns a new (deterministic and minimal) automaton that accepts the union of the given + * collection of {@link BytesRef}s representing UTF-8 encoded strings. The resulting automaton + * will be built in a binary representation. + * + * @param utf8Strings The input strings, UTF-8 encoded. The collection must be in sorted order. + * @return An {@link Automaton} accepting all input strings. The resulting automaton is binary + * based (UTF-8 encoded byte transition labels). + */ + public static Automaton makeBinaryStringUnion(Collection utf8Strings) { + if (utf8Strings.isEmpty()) { + return makeEmpty(); + } else { + return StringsToAutomaton.build(utf8Strings, true); + } + } + + /** + * Returns a new (deterministic and minimal) automaton that accepts the union of the given + * iterator of {@link BytesRef}s representing UTF-8 encoded strings. + * + * @param utf8Strings The input strings, UTF-8 encoded. The iterator must be in sorted order. + * @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint + * based (full unicode codepoints on transitions). + */ + public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOException { + return StringsToAutomaton.build(utf8Strings, false); + } + + /** + * Returns a new (deterministic and minimal) automaton that accepts the union of the given + * iterator of {@link BytesRef}s representing UTF-8 encoded strings. The resulting automaton will + * be built in a binary representation. + * + * @param utf8Strings The input strings, UTF-8 encoded. The iterator must be in sorted order. + * @return An {@link Automaton} accepting all input strings. The resulting automaton is binary + * based (UTF-8 encoded byte transition labels). + */ + public static Automaton makeBinaryStringUnion(BytesRefIterator utf8Strings) throws IOException { + return StringsToAutomaton.build(utf8Strings, true); + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java index a9abe6a6c2db..a5e42b98e63e 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java @@ -38,6 +38,9 @@ * @see #build(Collection, boolean) * @see #build(BytesRefIterator, boolean) * @see Automata#makeStringUnion(Collection) + * @see Automata#makeBinaryStringUnion(Collection) + * @see Automata#makeStringUnion(BytesRefIterator) + * @see Automata#makeBinaryStringUnion(BytesRefIterator) */ final class StringsToAutomaton { @@ -335,7 +338,7 @@ private void add(BytesRef current, boolean asBinary) { * Replace last child of state with an already registered state or stateRegistry the * last child state. */ - protected void replaceOrRegister(State state) { + private void replaceOrRegister(State state) { final State child = state.lastChild(); if (child.hasChildren()) replaceOrRegister(child); From 3163d4defeeee0a8cde466715fdbf10cb1d5501c Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Thu, 25 May 2023 18:42:27 -0700 Subject: [PATCH 3/6] little more cleanup --- .../org/apache/lucene/util/UnicodeUtil.java | 35 ++++++++++--------- .../util/automaton/StringsToAutomaton.java | 28 +++++++-------- .../apache/lucene/util/TestUnicodeUtil.java | 9 +++-- .../automaton/TestStringsToAutomaton.java | 2 ++ 4 files changed, 36 insertions(+), 38 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java index f4716b4c64e4..fae6a25023b1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java @@ -477,11 +477,11 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) { int utf8Upto = utf8.offset; final byte[] bytes = utf8.bytes; final int utf8Limit = utf8.offset + utf8.length; - UTF8CodePointState state = new UTF8CodePointState(); + UTF8CodePoint reuse = null; while (utf8Upto < utf8Limit) { - UTF8CodePointAt(bytes, utf8Upto, state); - ints[utf32Count++] = state.codePoint; - utf8Upto += state.codePointBytes; + reuse = codePointAt(bytes, utf8Upto, reuse); + ints[utf32Count++] = reuse.codePoint; + utf8Upto += reuse.codePointBytes; } return utf32Count; @@ -489,27 +489,26 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) { /** * Computes the codepoint and codepoint length (in bytes) of the specified {@code offset} in the - * provided {@code utf8} {@link BytesRef}, assuming UTF8 encoding. Note that {@code offset} is - * always zero-based, not relative to {@link BytesRef#offset}. As with other related methods in - * this class, this assumes valid UTF8 input and does not perform full UTF8 + * provided {@code utf8} byte array, assuming UTF8 encoding. As with other related methods in this + * class, this assumes valid UTF8 input and does not perform full UTF8 * validation. * * @throws IllegalArgumentException If invalid codepoint header byte occurs or the content is * prematurely truncated. */ - public static void UTF8CodePointAt(BytesRef utf8, int offset, UTF8CodePointState state) { - UTF8CodePointAt(utf8.bytes, utf8.offset + offset, state); - } + public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reuse) { + if (reuse == null) { + reuse = new UTF8CodePoint(); + } - private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState state) { int leadByte = utf8[pos] & 0xFF; int numBytes = utf8CodeLength[leadByte]; - state.codePointBytes = numBytes; + reuse.codePointBytes = numBytes; int v; switch (numBytes) { case 1 -> { - state.codePoint = leadByte; - return; + reuse.codePoint = leadByte; + return reuse; } case 2 -> v = leadByte & 31; // 5 useful bits case 3 -> v = leadByte & 15; // 4 useful bits @@ -523,11 +522,13 @@ private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState sta while (pos < limit) { v = v << 6 | utf8[pos++] & 63; } - state.codePoint = v; + reuse.codePoint = v; + + return reuse; } - /** Holds a Unicode codepoint along with the number of bytes required to represent it in UTF8 */ - public static final class UTF8CodePointState { + /** Holds a codepoint along with the number of bytes required to represent it in UTF8 */ + public static final class UTF8CodePoint { public int codePoint; public int codePointBytes; } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java index a5e42b98e63e..82b2ce0fd834 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java @@ -288,30 +288,27 @@ private void add(BytesRef current, boolean asBinary) { : "Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current; assert setPrevious(current); - // Reusable state information if we're building a non-binary based automaton - UnicodeUtil.UTF8CodePointState scratchState = null; - if (asBinary == false) { - scratchState = new UnicodeUtil.UTF8CodePointState(); - } + // Reusable codepoint information if we're building a non-binary based automaton + UnicodeUtil.UTF8CodePoint codePoint = null; // Descend in the automaton (find matching prefix). - int pos = 0, max = current.length; + byte[] bytes = current.bytes; + int pos = current.offset, max = current.offset + current.length; State next, state = root; if (asBinary) { - while (pos < max - && (next = state.lastChild(current.bytes[current.offset + pos] & 0xff)) != null) { + while (pos < max && (next = state.lastChild(bytes[pos] & 0xff)) != null) { state = next; pos++; } } else { while (pos < max) { - UnicodeUtil.UTF8CodePointAt(current, pos, scratchState); - next = state.lastChild(scratchState.codePoint); + codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint); + next = state.lastChild(codePoint.codePoint); if (next == null) { break; } state = next; - pos += scratchState.codePointBytes; + pos += codePoint.codePointBytes; } } @@ -320,15 +317,14 @@ private void add(BytesRef current, boolean asBinary) { // Add suffix if (asBinary) { while (pos < max) { - state = state.newState(current.bytes[current.offset + pos] & 0xff); + state = state.newState(bytes[pos] & 0xff); pos++; } } else { while (pos < max) { - assert scratchState != null; - UnicodeUtil.UTF8CodePointAt(current, pos, scratchState); - state = state.newState(scratchState.codePoint); - pos += scratchState.codePointBytes; + codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint); + state = state.newState(codePoint.codePoint); + pos += codePoint.codePointBytes; } } state.is_final = true; diff --git a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java index 9a80f1ae2eca..dde8c2236f51 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java @@ -168,21 +168,20 @@ public void testUTF8toUTF32() { public void testUTF8CodePointAt() { int num = atLeast(50000); - UnicodeUtil.UTF8CodePointState state = new UnicodeUtil.UTF8CodePointState(); + UnicodeUtil.UTF8CodePoint reuse = null; for (int i = 0; i < num; i++) { final String s = TestUtil.randomUnicodeString(random()); final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())]; final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8); - final BytesRef utf8Ref = newBytesRef(utf8, 0, utf8Len); int[] expected = s.codePoints().toArray(); int pos = 0; int expectedUpto = 0; while (pos < utf8Len) { - UnicodeUtil.UTF8CodePointAt(utf8Ref, pos, state); - assertEquals(expected[expectedUpto], state.codePoint); + reuse = UnicodeUtil.codePointAt(utf8, pos, reuse); + assertEquals(expected[expectedUpto], reuse.codePoint); expectedUpto++; - pos += state.codePointBytes; + pos += reuse.codePointBytes; } assertEquals(utf8Len, pos); assertEquals(expected.length, expectedUpto); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java index 54a6d861775a..b9c86a775eca 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java @@ -103,12 +103,14 @@ private void checkAutomaton(List expected, Automaton a, boolean isBina CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary); ByteRunAutomaton runAutomaton = c.runAutomaton; + // Make sure every expected term is accepted for (BytesRef t : expected) { String readable = isBinary ? t.toString() : t.utf8ToString(); assertTrue( readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length)); } + // Make sure every term produced by the automaton is expected BytesRefBuilder scratch = new BytesRefBuilder(); FiniteStringsIterator it = new FiniteStringsIterator(c.automaton); for (IntsRef r = it.next(); r != null; r = it.next()) { From e7ce7e3fa685ec81c9e508df5f0f55c6c7da9162 Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Fri, 26 May 2023 07:40:29 -0700 Subject: [PATCH 4/6] test basic minimization --- .../automaton/TestStringsToAutomaton.java | 44 ++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java index b9c86a775eca..09b3730c88fd 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java @@ -43,6 +43,7 @@ public void testBasic() throws Exception { Automaton a = build(terms, false); checkAutomaton(terms, a, false); + checkMinimized(a); } public void testBasicBinary() throws Exception { @@ -51,6 +52,35 @@ public void testBasicBinary() throws Exception { Automaton a = build(terms, true); checkAutomaton(terms, a, true); + checkMinimized(a); + } + + public void testRandomMinimized() throws Exception { + int iters = RandomizedTest.isNightly() ? 20 : 5; + for (int i = 0; i < iters; i++) { + boolean buildBinary = random().nextBoolean(); + int size = random().nextInt(2, 50); + Set terms = new HashSet<>(); + List automatonList = new ArrayList<>(size); + for (int j = 0; j < size; j++) { + if (buildBinary) { + BytesRef t = TestUtil.randomBinaryTerm(random(), 8); + terms.add(t); + automatonList.add(Automata.makeBinary(t)); + } else { + String s = TestUtil.randomRealisticUnicodeString(random(), 8); + terms.add(newBytesRef(s)); + automatonList.add(Automata.makeString(s)); + } + } + List sortedTerms = terms.stream().sorted().toList(); + + Automaton expected = + MinimizationOperations.minimize( + Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); + Automaton actual = build(sortedTerms, buildBinary); + assertSameAutomaton(expected, actual); + } } public void testRandomUnicodeOnly() throws Exception { @@ -119,6 +149,18 @@ private void checkAutomaton(List expected, Automaton a, boolean isBina } } + private void checkMinimized(Automaton a) { + Automaton minimized = + MinimizationOperations.minimize(a, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); + assertSameAutomaton(minimized, a); + } + + private static void assertSameAutomaton(Automaton a, Automaton b) { + assertEquals(a.getNumStates(), b.getNumStates()); + assertEquals(a.getNumTransitions(), b.getNumTransitions()); + assertTrue(Operations.sameLanguage(a, b)); + } + private List basicTerms() { List terms = new ArrayList<>(); terms.add(newBytesRef("dog")); @@ -137,7 +179,7 @@ private Automaton build(Collection terms, boolean asBinary) throws IOE } } - private static class TermIterator implements BytesRefIterator { + private static final class TermIterator implements BytesRefIterator { private final Iterator it; TermIterator(Collection terms) { From 84061eee79591bc959636d31b75b70c2d6f876a2 Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Fri, 26 May 2023 08:49:58 -0700 Subject: [PATCH 5/6] addressing some feedback --- .../java/org/apache/lucene/util/UnicodeUtil.java | 15 +++++++-------- .../lucene/util/automaton/StringsToAutomaton.java | 4 ++-- .../org/apache/lucene/util/TestUnicodeUtil.java | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java index fae6a25023b1..bb6378c9656b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java @@ -481,7 +481,7 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) { while (utf8Upto < utf8Limit) { reuse = codePointAt(bytes, utf8Upto, reuse); ints[utf32Count++] = reuse.codePoint; - utf8Upto += reuse.codePointBytes; + utf8Upto += reuse.numBytes; } return utf32Count; @@ -491,10 +491,8 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) { * Computes the codepoint and codepoint length (in bytes) of the specified {@code offset} in the * provided {@code utf8} byte array, assuming UTF8 encoding. As with other related methods in this * class, this assumes valid UTF8 input and does not perform full UTF8 - * validation. - * - * @throws IllegalArgumentException If invalid codepoint header byte occurs or the content is - * prematurely truncated. + * validation. Passing invalid UTF8 or a position that is not a valid header byte position may + * result in undefined behavior. This makes no attempt to synchronize or validate. */ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reuse) { if (reuse == null) { @@ -503,7 +501,7 @@ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reus int leadByte = utf8[pos] & 0xFF; int numBytes = utf8CodeLength[leadByte]; - reuse.codePointBytes = numBytes; + reuse.numBytes = numBytes; int v; switch (numBytes) { case 1 -> { @@ -513,7 +511,8 @@ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reus case 2 -> v = leadByte & 31; // 5 useful bits case 3 -> v = leadByte & 15; // 4 useful bits case 4 -> v = leadByte & 7; // 3 useful bits - default -> throw new IllegalArgumentException("invalid utf8"); + default -> throw new IllegalArgumentException( + "Invalid UTF8 header byte: 0x" + Integer.toHexString(leadByte)); } // TODO: this may read past utf8's limit. @@ -530,7 +529,7 @@ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reus /** Holds a codepoint along with the number of bytes required to represent it in UTF8 */ public static final class UTF8CodePoint { public int codePoint; - public int codePointBytes; + public int numBytes; } /** Shift value for lead surrogate to form a supplementary character. */ diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java index 82b2ce0fd834..4ab20328e2df 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java @@ -308,7 +308,7 @@ private void add(BytesRef current, boolean asBinary) { break; } state = next; - pos += codePoint.codePointBytes; + pos += codePoint.numBytes; } } @@ -324,7 +324,7 @@ private void add(BytesRef current, boolean asBinary) { while (pos < max) { codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint); state = state.newState(codePoint.codePoint); - pos += codePoint.codePointBytes; + pos += codePoint.numBytes; } } state.is_final = true; diff --git a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java index dde8c2236f51..2f309e945b74 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java @@ -181,7 +181,7 @@ public void testUTF8CodePointAt() { reuse = UnicodeUtil.codePointAt(utf8, pos, reuse); assertEquals(expected[expectedUpto], reuse.codePoint); expectedUpto++; - pos += reuse.codePointBytes; + pos += reuse.numBytes; } assertEquals(utf8Len, pos); assertEquals(expected.length, expectedUpto); From 485ebc82ce1f6fc923cad05a7b619b3a2e554c66 Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Fri, 26 May 2023 10:03:22 -0700 Subject: [PATCH 6/6] fixup rebase --- .../lucene/util/automaton/StringsToAutomaton.java | 15 ++------------- .../util/automaton/TestStringsToAutomaton.java | 2 +- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java index 4ab20328e2df..6c66dc6fd9ed 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java @@ -32,11 +32,9 @@ * algorithm described in Incremental Construction * of Minimal Acyclic Finite-State Automata by Daciuk, Mihov, Watson and Watson. This requires * sorted input data, but is very fast (nearly linear with the input size). Also offers the ability - * to directly build a binary {@link Automaton} representation. + * to directly build a binary {@link Automaton} representation. Users should access this + * functionality through {@link Automata} static methods. * - * @see #build(Collection) - * @see #build(Collection, boolean) - * @see #build(BytesRefIterator, boolean) * @see Automata#makeStringUnion(Collection) * @see Automata#makeBinaryStringUnion(Collection) * @see Automata#makeStringUnion(BytesRefIterator) @@ -234,15 +232,6 @@ private Automaton completeAndConvert() { return a.finish(); } - /** - * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing - * strings in UTF-8. These strings must be binary-sorted. Creates an {@link Automaton} with UTF-8 - * codepoints as transition labels. - */ - static Automaton build(Collection input) { - return build(input, false); - } - /** * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing * strings in UTF-8. These strings must be binary-sorted. Creates an {@link Automaton} with either diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java index 09b3730c88fd..f093623c3702 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java @@ -102,7 +102,7 @@ public void testLargeTerms() throws Exception { e.getMessage() .startsWith( "This builder doesn't allow terms that are larger than " - + StringsToAutomaton.MAX_TERM_LENGTH + + Automata.MAX_STRING_UNION_TERM_LENGTH + " characters")); byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000);