From 03658576876eef0e8e0bbab2aa2e6c607df8c205 Mon Sep 17 00:00:00 2001
From: Greg Miller <gsmiller@gmail.com>
Date: Sat, 20 May 2023 09:34:51 -0700
Subject: [PATCH 1/6] Add "direct to binary" option for
 DaciukMihovAutomatonBuilder and use in TermInSetQuery#visit

---
 lucene/CHANGES.txt                            |   3 +
 .../apache/lucene/search/TermInSetQuery.java  |  25 +--
 .../org/apache/lucene/util/UnicodeUtil.java   |  75 ++++---
 .../util/automaton/StringsToAutomaton.java    | 207 ++++++++++--------
 .../apache/lucene/util/TestUnicodeUtil.java   |  23 ++
 .../automaton/TestStringsToAutomaton.java     | 120 +++++++++-
 6 files changed, 320 insertions(+), 133 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index a81efd4c9447..53272260feac 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -158,6 +158,9 @@ Improvements
 
 * GITHUB#12290: Make memory fence in ByteBufferGuard explicit using `VarHandle.fullFence()`
 
+* GITHUB#12320: Add "direct to binary" option for DaciukMihovAutomatonBuilder and use it in TermInSetQuery#visit.
+  (Greg Miller)
+
 Optimizations
 ---------------------
 
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
index b989ac07849f..7cce67702771 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -17,11 +17,10 @@
 package org.apache.lucene.search;
 
 import java.io.IOException;
-import java.util.ArrayList;
+import java.io.UncheckedIOException;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.List;
 import java.util.SortedSet;
 import org.apache.lucene.index.FilteredTermsEnum;
 import org.apache.lucene.index.PrefixCodedTerms;
@@ -35,11 +34,9 @@
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.ByteRunAutomaton;
-import org.apache.lucene.util.automaton.CompiledAutomaton;
-import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.StringsToAutomaton;
 
 /**
  * Specialization for a disjunction over many terms that, by default, behaves like a {@link
@@ -150,17 +147,17 @@ public void visit(QueryVisitor visitor) {
     }
   }
 
-  // TODO: this is extremely slow. we should not be doing this.
+  // TODO: This is pretty heavy-weight. If we have TermInSetQuery directly extend AutomatonQuery
+  // we won't have to do this (see GH#12176).
   private ByteRunAutomaton asByteRunAutomaton() {
-    TermIterator iterator = termData.iterator();
-    List<Automaton> automata = new ArrayList<>();
-    for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
-      automata.add(Automata.makeBinary(term));
+    try {
+      Automaton a = StringsToAutomaton.build(termData.iterator(), true);
+      return new ByteRunAutomaton(a, true);
+    } catch (IOException e) {
+      // Shouldn't happen since termData.iterator() provides an interator implementation that
+      // never throws:
+      throw new UncheckedIOException(e);
     }
-    Automaton automaton =
-        Operations.determinize(
-            Operations.union(automata), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
-    return new CompiledAutomaton(automaton).runAutomaton;
   }
 
   @Override
diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
index a9e9d815eb49..f4716b4c64e4 100644
--- a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
@@ -477,38 +477,59 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) {
     int utf8Upto = utf8.offset;
     final byte[] bytes = utf8.bytes;
     final int utf8Limit = utf8.offset + utf8.length;
+    UTF8CodePointState state = new UTF8CodePointState();
     while (utf8Upto < utf8Limit) {
-      final int numBytes = utf8CodeLength[bytes[utf8Upto] & 0xFF];
-      int v = 0;
-      switch (numBytes) {
-        case 1:
-          ints[utf32Count++] = bytes[utf8Upto++];
-          continue;
-        case 2:
-          // 5 useful bits
-          v = bytes[utf8Upto++] & 31;
-          break;
-        case 3:
-          // 4 useful bits
-          v = bytes[utf8Upto++] & 15;
-          break;
-        case 4:
-          // 3 useful bits
-          v = bytes[utf8Upto++] & 7;
-          break;
-        default:
-          throw new IllegalArgumentException("invalid utf8");
-      }
+      UTF8CodePointAt(bytes, utf8Upto, state);
+      ints[utf32Count++] = state.codePoint;
+      utf8Upto += state.codePointBytes;
+    }
+
+    return utf32Count;
+  }
+
+  /**
+   * Computes the codepoint and codepoint length (in bytes) of the specified {@code offset} in the
+   * provided {@code utf8} {@link BytesRef}, assuming UTF8 encoding. Note that {@code offset} is
+   * always zero-based, not relative to {@link BytesRef#offset}. As with other related methods in
+   * this class, this assumes valid UTF8 input and <strong>does not perform</strong> full UTF8
+   * validation.
+   *
+   * @throws IllegalArgumentException If invalid codepoint header byte occurs or the content is
+   *     prematurely truncated.
+   */
+  public static void UTF8CodePointAt(BytesRef utf8, int offset, UTF8CodePointState state) {
+    UTF8CodePointAt(utf8.bytes, utf8.offset + offset, state);
+  }
 
-      // TODO: this may read past utf8's limit.
-      final int limit = utf8Upto + numBytes - 1;
-      while (utf8Upto < limit) {
-        v = v << 6 | bytes[utf8Upto++] & 63;
+  private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState state) {
+    int leadByte = utf8[pos] & 0xFF;
+    int numBytes = utf8CodeLength[leadByte];
+    state.codePointBytes = numBytes;
+    int v;
+    switch (numBytes) {
+      case 1 -> {
+        state.codePoint = leadByte;
+        return;
       }
-      ints[utf32Count++] = v;
+      case 2 -> v = leadByte & 31; // 5 useful bits
+      case 3 -> v = leadByte & 15; // 4 useful bits
+      case 4 -> v = leadByte & 7; // 3 useful bits
+      default -> throw new IllegalArgumentException("invalid utf8");
     }
 
-    return utf32Count;
+    // TODO: this may read past utf8's limit.
+    final int limit = pos + numBytes;
+    pos++;
+    while (pos < limit) {
+      v = v << 6 | utf8[pos++] & 63;
+    }
+    state.codePoint = v;
+  }
+
+  /** Holds a Unicode codepoint along with the number of bytes required to represent it in UTF8 */
+  public static final class UTF8CodePointState {
+    public int codePoint;
+    public int codePointBytes;
   }
 
   /** Shift value for lead surrogate to form a supplementary character. */
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
index 08826dcbbe3c..a9abe6a6c2db 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -16,22 +16,27 @@
  */
 package org.apache.lucene.util.automaton;
 
+import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
-import java.util.Comparator;
 import java.util.HashMap;
 import java.util.IdentityHashMap;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.BytesRefIterator;
+import org.apache.lucene.util.UnicodeUtil;
 
 /**
  * Builds a minimal, deterministic {@link Automaton} that accepts a set of strings using the
  * algorithm described in <a href="https://aclanthology.org/J00-1002.pdf">Incremental Construction
  * of Minimal Acyclic Finite-State Automata by Daciuk, Mihov, Watson and Watson</a>. This requires
- * sorted input data, but is very fast (nearly linear with the input size).
+ * sorted input data, but is very fast (nearly linear with the input size). Also offers the ability
+ * to directly build a binary {@link Automaton} representation.
  *
+ * @see #build(Collection)
+ * @see #build(Collection, boolean)
+ * @see #build(BytesRefIterator, boolean)
  * @see Automata#makeStringUnion(Collection)
  */
 final class StringsToAutomaton {
@@ -176,64 +181,16 @@ private static boolean referenceEquals(Object[] a1, Object[] a2) {
   /** Root automaton state. */
   private final State root = new State();
 
-  /** Previous sequence added to the automaton in {@link #add(CharsRef)}. */
-  private CharsRefBuilder previous;
+  /** Used for input order checking (only through assertions right now) */
+  private BytesRefBuilder previous;
 
-  /** A comparator used for enforcing sorted UTF8 order, used in assertions only. */
-  @SuppressWarnings("deprecation")
-  private static final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
-
-  /**
-   * Add another character sequence to this automaton. The sequence must be lexicographically larger
-   * or equal compared to any previous sequences added to this automaton (the input must be sorted).
-   */
-  private void add(CharsRef current) {
-    if (current.length > Automata.MAX_STRING_UNION_TERM_LENGTH) {
-      throw new IllegalArgumentException(
-          "This builder doesn't allow terms that are larger than 1,000 characters, got " + current);
-    }
-    assert stateRegistry != null : "Automaton already built.";
-    assert previous == null || comparator.compare(previous.get(), current) <= 0
-        : "Input must be in sorted UTF-8 order: " + previous + " >= " + current;
-    assert setPrevious(current);
-
-    // Descend in the automaton (find matching prefix).
-    int pos = 0, max = current.length();
-    State state = root;
-    for (; ; ) {
-      assert pos <= max;
-      if (pos == max) {
-        break;
-      }
-
-      int codePoint = Character.codePointAt(current, pos);
-      State next = state.lastChild(codePoint);
-      if (next == null) {
-        break;
-      }
-
-      state = next;
-      pos += Character.charCount(codePoint);
+  /** Copy <code>current</code> into an internal buffer. */
+  private boolean setPrevious(BytesRef current) {
+    if (previous == null) {
+      previous = new BytesRefBuilder();
     }
-
-    if (state.hasChildren()) replaceOrRegister(state);
-
-    addSuffix(state, current, pos);
-  }
-
-  /**
-   * Finalize the automaton and return the root state. No more strings can be added to the builder
-   * after this call.
-   *
-   * @return Root automaton state.
-   */
-  private State complete() {
-    if (this.stateRegistry == null) throw new IllegalStateException();
-
-    if (root.hasChildren()) replaceOrRegister(root);
-
-    stateRegistry = null;
-    return root;
+    previous.copyBytes(current);
+    return true;
   }
 
   /** Internal recursive traversal for conversion. */
@@ -258,39 +215,127 @@ private static int convert(
     return converted;
   }
 
+  /**
+   * Called after adding all terms. Performs final minimization and converts to a standard {@link
+   * Automaton} instance.
+   */
+  private Automaton completeAndConvert() {
+    // Final minimization:
+    if (this.stateRegistry == null) throw new IllegalStateException();
+    if (root.hasChildren()) replaceOrRegister(root);
+    stateRegistry = null;
+
+    // Convert:
+    Automaton.Builder a = new Automaton.Builder();
+    convert(a, root, new IdentityHashMap<>());
+    return a.finish();
+  }
+
   /**
    * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
-   * strings in UTF-8. These strings must be binary-sorted.
+   * strings in UTF-8. These strings must be binary-sorted. Creates an {@link Automaton} with UTF-8
+   * codepoints as transition labels.
    */
   static Automaton build(Collection<BytesRef> input) {
+    return build(input, false);
+  }
+
+  /**
+   * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
+   * strings in UTF-8. These strings must be binary-sorted. Creates an {@link Automaton} with either
+   * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code
+   * asBinary}.
+   */
+  static Automaton build(Collection<BytesRef> input, boolean asBinary) {
     final StringsToAutomaton builder = new StringsToAutomaton();
 
-    CharsRefBuilder current = new CharsRefBuilder();
     for (BytesRef b : input) {
-      current.copyUTF8Bytes(b);
-      builder.add(current.get());
+      builder.add(b, asBinary);
     }
 
-    Automaton.Builder a = new Automaton.Builder();
-    convert(a, builder.complete(), new IdentityHashMap<>());
+    return builder.completeAndConvert();
+  }
 
-    return a.finish();
+  /**
+   * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
+   * strings in UTF-8. These strings must be binary-sorted. Creates an {@link Automaton} with either
+   * UTF-8 codepoints as transition labels or binary (compiled) transition labels based on {@code
+   * asBinary}.
+   */
+  static Automaton build(BytesRefIterator input, boolean asBinary) throws IOException {
+    final StringsToAutomaton builder = new StringsToAutomaton();
+
+    for (BytesRef b = input.next(); b != null; b = input.next()) {
+      builder.add(b, asBinary);
+    }
+
+    return builder.completeAndConvert();
   }
 
-  /** Copy <code>current</code> into an internal buffer. */
-  private boolean setPrevious(CharsRef current) {
-    if (previous == null) {
-      previous = new CharsRefBuilder();
+  private void add(BytesRef current, boolean asBinary) {
+    if (current.length > Automata.MAX_STRING_UNION_TERM_LENGTH) {
+      throw new IllegalArgumentException(
+          "This builder doesn't allow terms that are larger than "
+              + Automata.MAX_STRING_UNION_TERM_LENGTH
+              + " characters, got "
+              + current);
     }
-    previous.copyChars(current);
-    return true;
+    assert stateRegistry != null : "Automaton already built.";
+    assert previous == null || previous.get().compareTo(current) <= 0
+        : "Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current;
+    assert setPrevious(current);
+
+    // Reusable state information if we're building a non-binary based automaton
+    UnicodeUtil.UTF8CodePointState scratchState = null;
+    if (asBinary == false) {
+      scratchState = new UnicodeUtil.UTF8CodePointState();
+    }
+
+    // Descend in the automaton (find matching prefix).
+    int pos = 0, max = current.length;
+    State next, state = root;
+    if (asBinary) {
+      while (pos < max
+          && (next = state.lastChild(current.bytes[current.offset + pos] & 0xff)) != null) {
+        state = next;
+        pos++;
+      }
+    } else {
+      while (pos < max) {
+        UnicodeUtil.UTF8CodePointAt(current, pos, scratchState);
+        next = state.lastChild(scratchState.codePoint);
+        if (next == null) {
+          break;
+        }
+        state = next;
+        pos += scratchState.codePointBytes;
+      }
+    }
+
+    if (state.hasChildren()) replaceOrRegister(state);
+
+    // Add suffix
+    if (asBinary) {
+      while (pos < max) {
+        state = state.newState(current.bytes[current.offset + pos] & 0xff);
+        pos++;
+      }
+    } else {
+      while (pos < max) {
+        assert scratchState != null;
+        UnicodeUtil.UTF8CodePointAt(current, pos, scratchState);
+        state = state.newState(scratchState.codePoint);
+        pos += scratchState.codePointBytes;
+      }
+    }
+    state.is_final = true;
   }
 
   /**
    * Replace last child of <code>state</code> with an already registered state or stateRegistry the
    * last child state.
    */
-  private void replaceOrRegister(State state) {
+  protected void replaceOrRegister(State state) {
     final State child = state.lastChild();
 
     if (child.hasChildren()) replaceOrRegister(child);
@@ -302,18 +347,4 @@ private void replaceOrRegister(State state) {
       stateRegistry.put(child, child);
     }
   }
-
-  /**
-   * Add a suffix of <code>current</code> starting at <code>fromIndex</code> (inclusive) to state
-   * <code>state</code>.
-   */
-  private void addSuffix(State state, CharSequence current, int fromIndex) {
-    final int len = current.length();
-    while (fromIndex < len) {
-      int cp = Character.codePointAt(current, fromIndex);
-      state = state.newState(cp);
-      fromIndex += Character.charCount(cp);
-    }
-    state.is_final = true;
-  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
index 8e0348b706c6..9a80f1ae2eca 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
@@ -166,6 +166,29 @@ public void testUTF8toUTF32() {
     }
   }
 
+  public void testUTF8CodePointAt() {
+    int num = atLeast(50000);
+    UnicodeUtil.UTF8CodePointState state = new UnicodeUtil.UTF8CodePointState();
+    for (int i = 0; i < num; i++) {
+      final String s = TestUtil.randomUnicodeString(random());
+      final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())];
+      final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
+      final BytesRef utf8Ref = newBytesRef(utf8, 0, utf8Len);
+
+      int[] expected = s.codePoints().toArray();
+      int pos = 0;
+      int expectedUpto = 0;
+      while (pos < utf8Len) {
+        UnicodeUtil.UTF8CodePointAt(utf8Ref, pos, state);
+        assertEquals(expected[expectedUpto], state.codePoint);
+        expectedUpto++;
+        pos += state.codePointBytes;
+      }
+      assertEquals(utf8Len, pos);
+      assertEquals(expected.length, expectedUpto);
+    }
+  }
+
   public void testNewString() {
     final int[] codePoints = {
       Character.toCodePoint(Character.MIN_HIGH_SURROGATE, Character.MAX_LOW_SURROGATE),
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
index a251ae748809..54a6d861775a 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@@ -16,26 +16,138 @@
  */
 package org.apache.lucene.util.automaton;
 
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
 import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.TestUtil;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.BytesRefIterator;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.Util;
 
 public class TestStringsToAutomaton extends LuceneTestCase {
 
-  public void testLargeTerms() {
+  public void testBasic() throws Exception {
+    List<BytesRef> terms = basicTerms();
+    Collections.sort(terms);
+
+    Automaton a = build(terms, false);
+    checkAutomaton(terms, a, false);
+  }
+
+  public void testBasicBinary() throws Exception {
+    List<BytesRef> terms = basicTerms();
+    Collections.sort(terms);
+
+    Automaton a = build(terms, true);
+    checkAutomaton(terms, a, true);
+  }
+
+  public void testRandomUnicodeOnly() throws Exception {
+    testRandom(false);
+  }
+
+  public void testRandomBinary() throws Exception {
+    testRandom(true);
+  }
+
+  public void testLargeTerms() throws Exception {
     byte[] b10k = new byte[10_000];
     Arrays.fill(b10k, (byte) 'a');
     IllegalArgumentException e =
         expectThrows(
             IllegalArgumentException.class,
-            () -> StringsToAutomaton.build(Collections.singleton(new BytesRef(b10k))));
+            () -> build(Collections.singleton(new BytesRef(b10k)), false));
     assertTrue(
         e.getMessage()
-            .startsWith("This builder doesn't allow terms that are larger than 1,000 characters"));
+            .startsWith(
+                "This builder doesn't allow terms that are larger than "
+                    + StringsToAutomaton.MAX_TERM_LENGTH
+                    + " characters"));
 
     byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000);
-    StringsToAutomaton.build(Collections.singleton(new BytesRef(b1k))); // no exception
+    build(Collections.singleton(new BytesRef(b1k)), false); // no exception
+  }
+
+  private void testRandom(boolean allowBinary) throws Exception {
+    int iters = RandomizedTest.isNightly() ? 50 : 10;
+    for (int i = 0; i < iters; i++) {
+      int size = random().nextInt(500, 2_000);
+      Set<BytesRef> terms = new HashSet<>(size);
+      for (int j = 0; j < size; j++) {
+        if (allowBinary && random().nextInt(10) < 2) {
+          // Sometimes random bytes term that isn't necessarily valid unicode
+          terms.add(newBytesRef(TestUtil.randomBinaryTerm(random())));
+        } else {
+          terms.add(newBytesRef(TestUtil.randomRealisticUnicodeString(random())));
+        }
+      }
+
+      List<BytesRef> sorted = terms.stream().sorted().toList();
+      Automaton a = build(sorted, allowBinary);
+      checkAutomaton(sorted, a, allowBinary);
+    }
+  }
+
+  private void checkAutomaton(List<BytesRef> expected, Automaton a, boolean isBinary) {
+    CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary);
+    ByteRunAutomaton runAutomaton = c.runAutomaton;
+
+    for (BytesRef t : expected) {
+      String readable = isBinary ? t.toString() : t.utf8ToString();
+      assertTrue(
+          readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length));
+    }
+
+    BytesRefBuilder scratch = new BytesRefBuilder();
+    FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
+    for (IntsRef r = it.next(); r != null; r = it.next()) {
+      BytesRef t = Util.toBytesRef(r, scratch);
+      assertTrue(expected.contains(t));
+    }
+  }
+
+  private List<BytesRef> basicTerms() {
+    List<BytesRef> terms = new ArrayList<>();
+    terms.add(newBytesRef("dog"));
+    terms.add(newBytesRef("day"));
+    terms.add(newBytesRef("dad"));
+    terms.add(newBytesRef("cats"));
+    terms.add(newBytesRef("cat"));
+    return terms;
+  }
+
+  private Automaton build(Collection<BytesRef> terms, boolean asBinary) throws IOException {
+    if (random().nextBoolean()) {
+      return StringsToAutomaton.build(terms, asBinary);
+    } else {
+      return StringsToAutomaton.build(new TermIterator(terms), asBinary);
+    }
+  }
+
+  private static class TermIterator implements BytesRefIterator {
+    private final Iterator<BytesRef> it;
+
+    TermIterator(Collection<BytesRef> terms) {
+      this.it = terms.iterator();
+    }
+
+    @Override
+    public BytesRef next() throws IOException {
+      if (it.hasNext() == false) {
+        return null;
+      }
+      return it.next();
+    }
   }
 }

From 681f384beb9431fdf0baa709af07c1dc33479be0 Mon Sep 17 00:00:00 2001
From: Greg Miller <gsmiller@gmail.com>
Date: Thu, 25 May 2023 17:35:32 -0700
Subject: [PATCH 2/6] plumb through Automata

---
 .../apache/lucene/search/TermInSetQuery.java  |  4 +-
 .../lucene/util/automaton/Automata.java       | 46 ++++++++++++++++++-
 .../util/automaton/StringsToAutomaton.java    |  5 +-
 3 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
index 7cce67702771..542f94e65f8e 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -34,9 +34,9 @@
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.ByteRunAutomaton;
-import org.apache.lucene.util.automaton.StringsToAutomaton;
 
 /**
  * Specialization for a disjunction over many terms that, by default, behaves like a {@link
@@ -151,7 +151,7 @@ public void visit(QueryVisitor visitor) {
   // we won't have to do this (see GH#12176).
   private ByteRunAutomaton asByteRunAutomaton() {
     try {
-      Automaton a = StringsToAutomaton.build(termData.iterator(), true);
+      Automaton a = Automata.makeBinaryStringUnion(termData.iterator());
       return new ByteRunAutomaton(a, true);
     } catch (IOException e) {
       // Shouldn't happen since termData.iterator() provides an interator implementation that
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
index 20233374ce93..89522781fdbd 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
@@ -29,9 +29,11 @@
 
 package org.apache.lucene.util.automaton;
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
 import org.apache.lucene.util.StringHelper;
 
 /**
@@ -578,7 +580,49 @@ public static Automaton makeStringUnion(Collection<BytesRef> utf8Strings) {
     if (utf8Strings.isEmpty()) {
       return makeEmpty();
     } else {
-      return StringsToAutomaton.build(utf8Strings);
+      return StringsToAutomaton.build(utf8Strings, false);
     }
   }
+
+  /**
+   * Returns a new (deterministic and minimal) automaton that accepts the union of the given
+   * collection of {@link BytesRef}s representing UTF-8 encoded strings. The resulting automaton
+   * will be built in a binary representation.
+   *
+   * @param utf8Strings The input strings, UTF-8 encoded. The collection must be in sorted order.
+   * @return An {@link Automaton} accepting all input strings. The resulting automaton is binary
+   *     based (UTF-8 encoded byte transition labels).
+   */
+  public static Automaton makeBinaryStringUnion(Collection<BytesRef> utf8Strings) {
+    if (utf8Strings.isEmpty()) {
+      return makeEmpty();
+    } else {
+      return StringsToAutomaton.build(utf8Strings, true);
+    }
+  }
+
+  /**
+   * Returns a new (deterministic and minimal) automaton that accepts the union of the given
+   * iterator of {@link BytesRef}s representing UTF-8 encoded strings.
+   *
+   * @param utf8Strings The input strings, UTF-8 encoded. The iterator must be in sorted order.
+   * @return An {@link Automaton} accepting all input strings. The resulting automaton is codepoint
+   *     based (full unicode codepoints on transitions).
+   */
+  public static Automaton makeStringUnion(BytesRefIterator utf8Strings) throws IOException {
+    return StringsToAutomaton.build(utf8Strings, false);
+  }
+
+  /**
+   * Returns a new (deterministic and minimal) automaton that accepts the union of the given
+   * iterator of {@link BytesRef}s representing UTF-8 encoded strings. The resulting automaton will
+   * be built in a binary representation.
+   *
+   * @param utf8Strings The input strings, UTF-8 encoded. The iterator must be in sorted order.
+   * @return An {@link Automaton} accepting all input strings. The resulting automaton is binary
+   *     based (UTF-8 encoded byte transition labels).
+   */
+  public static Automaton makeBinaryStringUnion(BytesRefIterator utf8Strings) throws IOException {
+    return StringsToAutomaton.build(utf8Strings, true);
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
index a9abe6a6c2db..a5e42b98e63e 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -38,6 +38,9 @@
  * @see #build(Collection, boolean)
  * @see #build(BytesRefIterator, boolean)
  * @see Automata#makeStringUnion(Collection)
+ * @see Automata#makeBinaryStringUnion(Collection)
+ * @see Automata#makeStringUnion(BytesRefIterator)
+ * @see Automata#makeBinaryStringUnion(BytesRefIterator)
  */
 final class StringsToAutomaton {
 
@@ -335,7 +338,7 @@ private void add(BytesRef current, boolean asBinary) {
    * Replace last child of <code>state</code> with an already registered state or stateRegistry the
    * last child state.
    */
-  protected void replaceOrRegister(State state) {
+  private void replaceOrRegister(State state) {
     final State child = state.lastChild();
 
     if (child.hasChildren()) replaceOrRegister(child);

From 3163d4defeeee0a8cde466715fdbf10cb1d5501c Mon Sep 17 00:00:00 2001
From: Greg Miller <gsmiller@gmail.com>
Date: Thu, 25 May 2023 18:42:27 -0700
Subject: [PATCH 3/6] little more cleanup

---
 .../org/apache/lucene/util/UnicodeUtil.java   | 35 ++++++++++---------
 .../util/automaton/StringsToAutomaton.java    | 28 +++++++--------
 .../apache/lucene/util/TestUnicodeUtil.java   |  9 +++--
 .../automaton/TestStringsToAutomaton.java     |  2 ++
 4 files changed, 36 insertions(+), 38 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
index f4716b4c64e4..fae6a25023b1 100644
--- a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
@@ -477,11 +477,11 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) {
     int utf8Upto = utf8.offset;
     final byte[] bytes = utf8.bytes;
     final int utf8Limit = utf8.offset + utf8.length;
-    UTF8CodePointState state = new UTF8CodePointState();
+    UTF8CodePoint reuse = null;
     while (utf8Upto < utf8Limit) {
-      UTF8CodePointAt(bytes, utf8Upto, state);
-      ints[utf32Count++] = state.codePoint;
-      utf8Upto += state.codePointBytes;
+      reuse = codePointAt(bytes, utf8Upto, reuse);
+      ints[utf32Count++] = reuse.codePoint;
+      utf8Upto += reuse.codePointBytes;
     }
 
     return utf32Count;
@@ -489,27 +489,26 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) {
 
   /**
    * Computes the codepoint and codepoint length (in bytes) of the specified {@code offset} in the
-   * provided {@code utf8} {@link BytesRef}, assuming UTF8 encoding. Note that {@code offset} is
-   * always zero-based, not relative to {@link BytesRef#offset}. As with other related methods in
-   * this class, this assumes valid UTF8 input and <strong>does not perform</strong> full UTF8
+   * provided {@code utf8} byte array, assuming UTF8 encoding. As with other related methods in this
+   * class, this assumes valid UTF8 input and <strong>does not perform</strong> full UTF8
    * validation.
    *
    * @throws IllegalArgumentException If invalid codepoint header byte occurs or the content is
    *     prematurely truncated.
    */
-  public static void UTF8CodePointAt(BytesRef utf8, int offset, UTF8CodePointState state) {
-    UTF8CodePointAt(utf8.bytes, utf8.offset + offset, state);
-  }
+  public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reuse) {
+    if (reuse == null) {
+      reuse = new UTF8CodePoint();
+    }
 
-  private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState state) {
     int leadByte = utf8[pos] & 0xFF;
     int numBytes = utf8CodeLength[leadByte];
-    state.codePointBytes = numBytes;
+    reuse.codePointBytes = numBytes;
     int v;
     switch (numBytes) {
       case 1 -> {
-        state.codePoint = leadByte;
-        return;
+        reuse.codePoint = leadByte;
+        return reuse;
       }
       case 2 -> v = leadByte & 31; // 5 useful bits
       case 3 -> v = leadByte & 15; // 4 useful bits
@@ -523,11 +522,13 @@ private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState sta
     while (pos < limit) {
       v = v << 6 | utf8[pos++] & 63;
     }
-    state.codePoint = v;
+    reuse.codePoint = v;
+
+    return reuse;
   }
 
-  /** Holds a Unicode codepoint along with the number of bytes required to represent it in UTF8 */
-  public static final class UTF8CodePointState {
+  /** Holds a codepoint along with the number of bytes required to represent it in UTF8 */
+  public static final class UTF8CodePoint {
     public int codePoint;
     public int codePointBytes;
   }
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
index a5e42b98e63e..82b2ce0fd834 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -288,30 +288,27 @@ private void add(BytesRef current, boolean asBinary) {
         : "Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current;
     assert setPrevious(current);
 
-    // Reusable state information if we're building a non-binary based automaton
-    UnicodeUtil.UTF8CodePointState scratchState = null;
-    if (asBinary == false) {
-      scratchState = new UnicodeUtil.UTF8CodePointState();
-    }
+    // Reusable codepoint information if we're building a non-binary based automaton
+    UnicodeUtil.UTF8CodePoint codePoint = null;
 
     // Descend in the automaton (find matching prefix).
-    int pos = 0, max = current.length;
+    byte[] bytes = current.bytes;
+    int pos = current.offset, max = current.offset + current.length;
     State next, state = root;
     if (asBinary) {
-      while (pos < max
-          && (next = state.lastChild(current.bytes[current.offset + pos] & 0xff)) != null) {
+      while (pos < max && (next = state.lastChild(bytes[pos] & 0xff)) != null) {
         state = next;
         pos++;
       }
     } else {
       while (pos < max) {
-        UnicodeUtil.UTF8CodePointAt(current, pos, scratchState);
-        next = state.lastChild(scratchState.codePoint);
+        codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint);
+        next = state.lastChild(codePoint.codePoint);
         if (next == null) {
           break;
         }
         state = next;
-        pos += scratchState.codePointBytes;
+        pos += codePoint.codePointBytes;
       }
     }
 
@@ -320,15 +317,14 @@ private void add(BytesRef current, boolean asBinary) {
     // Add suffix
     if (asBinary) {
       while (pos < max) {
-        state = state.newState(current.bytes[current.offset + pos] & 0xff);
+        state = state.newState(bytes[pos] & 0xff);
         pos++;
       }
     } else {
       while (pos < max) {
-        assert scratchState != null;
-        UnicodeUtil.UTF8CodePointAt(current, pos, scratchState);
-        state = state.newState(scratchState.codePoint);
-        pos += scratchState.codePointBytes;
+        codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint);
+        state = state.newState(codePoint.codePoint);
+        pos += codePoint.codePointBytes;
       }
     }
     state.is_final = true;
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
index 9a80f1ae2eca..dde8c2236f51 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
@@ -168,21 +168,20 @@ public void testUTF8toUTF32() {
 
   public void testUTF8CodePointAt() {
     int num = atLeast(50000);
-    UnicodeUtil.UTF8CodePointState state = new UnicodeUtil.UTF8CodePointState();
+    UnicodeUtil.UTF8CodePoint reuse = null;
     for (int i = 0; i < num; i++) {
       final String s = TestUtil.randomUnicodeString(random());
       final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())];
       final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
-      final BytesRef utf8Ref = newBytesRef(utf8, 0, utf8Len);
 
       int[] expected = s.codePoints().toArray();
       int pos = 0;
       int expectedUpto = 0;
       while (pos < utf8Len) {
-        UnicodeUtil.UTF8CodePointAt(utf8Ref, pos, state);
-        assertEquals(expected[expectedUpto], state.codePoint);
+        reuse = UnicodeUtil.codePointAt(utf8, pos, reuse);
+        assertEquals(expected[expectedUpto], reuse.codePoint);
         expectedUpto++;
-        pos += state.codePointBytes;
+        pos += reuse.codePointBytes;
       }
       assertEquals(utf8Len, pos);
       assertEquals(expected.length, expectedUpto);
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
index 54a6d861775a..b9c86a775eca 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@@ -103,12 +103,14 @@ private void checkAutomaton(List<BytesRef> expected, Automaton a, boolean isBina
     CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary);
     ByteRunAutomaton runAutomaton = c.runAutomaton;
 
+    // Make sure every expected term is accepted
     for (BytesRef t : expected) {
       String readable = isBinary ? t.toString() : t.utf8ToString();
       assertTrue(
           readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length));
     }
 
+    // Make sure every term produced by the automaton is expected
     BytesRefBuilder scratch = new BytesRefBuilder();
     FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
     for (IntsRef r = it.next(); r != null; r = it.next()) {

From e7ce7e3fa685ec81c9e508df5f0f55c6c7da9162 Mon Sep 17 00:00:00 2001
From: Greg Miller <gsmiller@gmail.com>
Date: Fri, 26 May 2023 07:40:29 -0700
Subject: [PATCH 4/6] test basic minimization

---
 .../automaton/TestStringsToAutomaton.java     | 44 ++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
index b9c86a775eca..09b3730c88fd 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@@ -43,6 +43,7 @@ public void testBasic() throws Exception {
 
     Automaton a = build(terms, false);
     checkAutomaton(terms, a, false);
+    checkMinimized(a);
   }
 
   public void testBasicBinary() throws Exception {
@@ -51,6 +52,35 @@ public void testBasicBinary() throws Exception {
 
     Automaton a = build(terms, true);
     checkAutomaton(terms, a, true);
+    checkMinimized(a);
+  }
+
+  public void testRandomMinimized() throws Exception {
+    int iters = RandomizedTest.isNightly() ? 20 : 5;
+    for (int i = 0; i < iters; i++) {
+      boolean buildBinary = random().nextBoolean();
+      int size = random().nextInt(2, 50);
+      Set<BytesRef> terms = new HashSet<>();
+      List<Automaton> automatonList = new ArrayList<>(size);
+      for (int j = 0; j < size; j++) {
+        if (buildBinary) {
+          BytesRef t = TestUtil.randomBinaryTerm(random(), 8);
+          terms.add(t);
+          automatonList.add(Automata.makeBinary(t));
+        } else {
+          String s = TestUtil.randomRealisticUnicodeString(random(), 8);
+          terms.add(newBytesRef(s));
+          automatonList.add(Automata.makeString(s));
+        }
+      }
+      List<BytesRef> sortedTerms = terms.stream().sorted().toList();
+
+      Automaton expected =
+          MinimizationOperations.minimize(
+              Operations.union(automatonList), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+      Automaton actual = build(sortedTerms, buildBinary);
+      assertSameAutomaton(expected, actual);
+    }
   }
 
   public void testRandomUnicodeOnly() throws Exception {
@@ -119,6 +149,18 @@ private void checkAutomaton(List<BytesRef> expected, Automaton a, boolean isBina
     }
   }
 
+  private void checkMinimized(Automaton a) {
+    Automaton minimized =
+        MinimizationOperations.minimize(a, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+    assertSameAutomaton(minimized, a);
+  }
+
+  private static void assertSameAutomaton(Automaton a, Automaton b) {
+    assertEquals(a.getNumStates(), b.getNumStates());
+    assertEquals(a.getNumTransitions(), b.getNumTransitions());
+    assertTrue(Operations.sameLanguage(a, b));
+  }
+
   private List<BytesRef> basicTerms() {
     List<BytesRef> terms = new ArrayList<>();
     terms.add(newBytesRef("dog"));
@@ -137,7 +179,7 @@ private Automaton build(Collection<BytesRef> terms, boolean asBinary) throws IOE
     }
   }
 
-  private static class TermIterator implements BytesRefIterator {
+  private static final class TermIterator implements BytesRefIterator {
     private final Iterator<BytesRef> it;
 
     TermIterator(Collection<BytesRef> terms) {

From 84061eee79591bc959636d31b75b70c2d6f876a2 Mon Sep 17 00:00:00 2001
From: Greg Miller <gsmiller@gmail.com>
Date: Fri, 26 May 2023 08:49:58 -0700
Subject: [PATCH 5/6] addressing some feedback

---
 .../java/org/apache/lucene/util/UnicodeUtil.java  | 15 +++++++--------
 .../lucene/util/automaton/StringsToAutomaton.java |  4 ++--
 .../org/apache/lucene/util/TestUnicodeUtil.java   |  2 +-
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
index fae6a25023b1..bb6378c9656b 100644
--- a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
@@ -481,7 +481,7 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) {
     while (utf8Upto < utf8Limit) {
       reuse = codePointAt(bytes, utf8Upto, reuse);
       ints[utf32Count++] = reuse.codePoint;
-      utf8Upto += reuse.codePointBytes;
+      utf8Upto += reuse.numBytes;
     }
 
     return utf32Count;
@@ -491,10 +491,8 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) {
    * Computes the codepoint and codepoint length (in bytes) of the specified {@code offset} in the
    * provided {@code utf8} byte array, assuming UTF8 encoding. As with other related methods in this
    * class, this assumes valid UTF8 input and <strong>does not perform</strong> full UTF8
-   * validation.
-   *
-   * @throws IllegalArgumentException If invalid codepoint header byte occurs or the content is
-   *     prematurely truncated.
+   * validation. Passing invalid UTF8 or a position that is not a valid header byte position may
+   * result in undefined behavior. This makes no attempt to synchronize or validate.
    */
   public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reuse) {
     if (reuse == null) {
@@ -503,7 +501,7 @@ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reus
 
     int leadByte = utf8[pos] & 0xFF;
     int numBytes = utf8CodeLength[leadByte];
-    reuse.codePointBytes = numBytes;
+    reuse.numBytes = numBytes;
     int v;
     switch (numBytes) {
       case 1 -> {
@@ -513,7 +511,8 @@ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reus
       case 2 -> v = leadByte & 31; // 5 useful bits
       case 3 -> v = leadByte & 15; // 4 useful bits
       case 4 -> v = leadByte & 7; // 3 useful bits
-      default -> throw new IllegalArgumentException("invalid utf8");
+      default -> throw new IllegalArgumentException(
+          "Invalid UTF8 header byte: 0x" + Integer.toHexString(leadByte));
     }
 
     // TODO: this may read past utf8's limit.
@@ -530,7 +529,7 @@ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reus
   /** Holds a codepoint along with the number of bytes required to represent it in UTF8 */
   public static final class UTF8CodePoint {
     public int codePoint;
-    public int codePointBytes;
+    public int numBytes;
   }
 
   /** Shift value for lead surrogate to form a supplementary character. */
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
index 82b2ce0fd834..4ab20328e2df 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -308,7 +308,7 @@ private void add(BytesRef current, boolean asBinary) {
           break;
         }
         state = next;
-        pos += codePoint.codePointBytes;
+        pos += codePoint.numBytes;
       }
     }
 
@@ -324,7 +324,7 @@ private void add(BytesRef current, boolean asBinary) {
       while (pos < max) {
         codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint);
         state = state.newState(codePoint.codePoint);
-        pos += codePoint.codePointBytes;
+        pos += codePoint.numBytes;
       }
     }
     state.is_final = true;
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
index dde8c2236f51..2f309e945b74 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
@@ -181,7 +181,7 @@ public void testUTF8CodePointAt() {
         reuse = UnicodeUtil.codePointAt(utf8, pos, reuse);
         assertEquals(expected[expectedUpto], reuse.codePoint);
         expectedUpto++;
-        pos += reuse.codePointBytes;
+        pos += reuse.numBytes;
       }
       assertEquals(utf8Len, pos);
       assertEquals(expected.length, expectedUpto);

From 485ebc82ce1f6fc923cad05a7b619b3a2e554c66 Mon Sep 17 00:00:00 2001
From: Greg Miller <gsmiller@gmail.com>
Date: Fri, 26 May 2023 10:03:22 -0700
Subject: [PATCH 6/6] fixup rebase

---
 .../lucene/util/automaton/StringsToAutomaton.java | 15 ++-------------
 .../util/automaton/TestStringsToAutomaton.java    |  2 +-
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
index 4ab20328e2df..6c66dc6fd9ed 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -32,11 +32,9 @@
  * algorithm described in <a href="https://aclanthology.org/J00-1002.pdf">Incremental Construction
  * of Minimal Acyclic Finite-State Automata by Daciuk, Mihov, Watson and Watson</a>. This requires
  * sorted input data, but is very fast (nearly linear with the input size). Also offers the ability
- * to directly build a binary {@link Automaton} representation.
+ * to directly build a binary {@link Automaton} representation. Users should access this
+ * functionality through {@link Automata} static methods.
  *
- * @see #build(Collection)
- * @see #build(Collection, boolean)
- * @see #build(BytesRefIterator, boolean)
  * @see Automata#makeStringUnion(Collection)
  * @see Automata#makeBinaryStringUnion(Collection)
  * @see Automata#makeStringUnion(BytesRefIterator)
@@ -234,15 +232,6 @@ private Automaton completeAndConvert() {
     return a.finish();
   }
 
-  /**
-   * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
-   * strings in UTF-8. These strings must be binary-sorted. Creates an {@link Automaton} with UTF-8
-   * codepoints as transition labels.
-   */
-  static Automaton build(Collection<BytesRef> input) {
-    return build(input, false);
-  }
-
   /**
    * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
    * strings in UTF-8. These strings must be binary-sorted. Creates an {@link Automaton} with either
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
index 09b3730c88fd..f093623c3702 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@@ -102,7 +102,7 @@ public void testLargeTerms() throws Exception {
         e.getMessage()
             .startsWith(
                 "This builder doesn't allow terms that are larger than "
-                    + StringsToAutomaton.MAX_TERM_LENGTH
+                    + Automata.MAX_STRING_UNION_TERM_LENGTH
                     + " characters"));
 
     byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, 1000);