diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java
index 3619a4d8cb59..d5a95d0d6bd4 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java
@@ -16,18 +16,55 @@
  */
 package org.apache.lucene.util.automaton;
 
+import com.carrotsearch.randomizedtesting.RandomizedTest;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
 import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.TestUtil;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.BytesRefIterator;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.Util;
 
 public class TestDaciukMihovAutomatonBuilder extends LuceneTestCase {
 
+  /** Builds a non-binary automaton from the basic terms and checks what it accepts. */
+  public void testBasic() throws Exception {
+    List<BytesRef> terms = basicTerms();
+    Collections.sort(terms);
+
+    Automaton a = build(terms, false);
+    checkAutomaton(terms, a, false);
+  }
+
+  /** Same as {@link #testBasic()}, but builds and checks the automaton in binary mode. */
+  public void testBasicBinary() throws Exception {
+    List<BytesRef> terms = basicTerms();
+    Collections.sort(terms);
+
+    Automaton a = build(terms, true);
+    checkAutomaton(terms, a, true);
+  }
+
+  /** Random terms, all realistic unicode strings, non-binary automaton. */
+  public void testRandomUnicodeOnly() throws Exception {
+    testRandom(false);
+  }
+
+  /** Random terms, some of them arbitrary binary, binary automaton. */
+  public void testRandomBinary() throws Exception {
+    testRandom(true);
+  }
+
   public void testLargeTerms() throws Exception {
     byte[] b10k = new byte[10_000];
     Arrays.fill(b10k, (byte) 'a');
@@ -46,6 +83,77 @@ public void testLargeTerms() throws Exception {
     build(Collections.singleton(new BytesRef(b1k)), false); // no exception
   }
 
+  /**
+   * Runs several rounds of building an automaton from a random set of unique terms and checking
+   * what it accepts.
+   *
+   * @param allowBinary if true, about 2 in 10 terms are random binary terms and the automaton is
+   *     built and checked in binary mode; otherwise all terms are realistic unicode strings
+   */
+  private void testRandom(boolean allowBinary) throws Exception {
+    int iters = RandomizedTest.isNightly() ? 50 : 10;
+    for (int i = 0; i < iters; i++) {
+      int size = random().nextInt(500, 2_000);
+      Set<BytesRef> terms = new HashSet<>(size);
+      for (int j = 0; j < size; j++) {
+        if (allowBinary && random().nextInt(10) < 2) {
+          // Sometimes random bytes term that isn't necessarily valid unicode
+          terms.add(newBytesRef(TestUtil.randomBinaryTerm(random())));
+        } else {
+          terms.add(newBytesRef(TestUtil.randomRealisticUnicodeString(random())));
+        }
+      }
+
+      List<BytesRef> sorted = terms.stream().sorted().toList();
+      Automaton a = build(sorted, allowBinary);
+      checkAutomaton(sorted, a, allowBinary);
+    }
+  }
+
+  /**
+   * Asserts that the automaton accepts every expected term and that every string it accepts is
+   * one of the expected terms.
+   *
+   * @param expected the terms the automaton was built from
+   * @param a the automaton under test
+   * @param isBinary passed through to {@link CompiledAutomaton}; also selects how terms are
+   *     rendered in the "should be found" failure message
+   */
+  private void checkAutomaton(List<BytesRef> expected, Automaton a, boolean isBinary) {
+    CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary);
+    ByteRunAutomaton runAutomaton = c.runAutomaton;
+
+    // Every expected term must be accepted:
+    for (BytesRef t : expected) {
+      String readable = isBinary ? t.toString() : t.utf8ToString();
+      assertTrue(
+          readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length));
+    }
+
+    // Nothing else may be accepted. Look terms up in a hash set so each membership check is a
+    // hash lookup rather than a linear scan of the expected list:
+    Set<BytesRef> expectedSet = new HashSet<>(expected);
+    BytesRefBuilder scratch = new BytesRefBuilder();
+    FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
+    for (IntsRef r = it.next(); r != null; r = it.next()) {
+      BytesRef t = Util.toBytesRef(r, scratch);
+      if (!expectedSet.contains(t)) {
+        fail("automaton accepted a term that was not expected: " + t);
+      }
+    }
+  }
+
+  /** Returns a small unsorted list of terms with shared prefixes; callers sort it first. */
+  private List<BytesRef> basicTerms() {
+    List<BytesRef> terms = new ArrayList<>();
+    terms.add(newBytesRef("dog"));
+    terms.add(newBytesRef("day"));
+    terms.add(newBytesRef("dad"));
+    terms.add(newBytesRef("cats"));
+    terms.add(newBytesRef("cat"));
+    return terms;
+  }
+
   private Automaton build(Collection<BytesRef> terms, boolean asBinary) throws IOException {
     if (random().nextBoolean()) {
       return DaciukMihovAutomatonBuilder.build(terms, asBinary);