Add grapheme cluster handling tests

GraphemeBreakTest.txt was downloaded from the Unicode Character Database [0]. Changes to build.gradle.kts were required to stop `gradlew test` from regenerating the resources with empty JSON objects, and to add the kotlin-test dependency. [0]: https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
JetBrains · Aug 14, 2023 · 068d610 · 068d610
1 parent 41177b9
commit 068d610
Show file tree

Hide file tree

Showing 5 changed files with 809 additions and 10 deletions.
diff --git a/vim-engine/build.gradle.kts b/vim-engine/build.gradle.kts
@@ -14,6 +14,8 @@ plugins {
   `maven-publish`
 }
 
+val kotlinVersion: String by project
+
 // group 'org.jetbrains.ideavim'
 // version 'SNAPSHOT'
 
@@ -27,10 +29,17 @@ ksp {
   arg("ex_commands_file", "engine_ex_commands.json")
 }
 
+afterEvaluate { // Stops `gradlew test` from regenerating the resources with empty JSON objects.
+    tasks.named("kspTestKotlin").configure { enabled = false }
+}
+
 dependencies {
     testImplementation("org.junit.jupiter:junit-jupiter-api:5.9.2")
     testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine:5.9.2")
-    compileOnly("org.jetbrains.kotlin:kotlin-stdlib:1.8.21")
+
+    // https://mvnrepository.com/artifact/org.jetbrains.kotlin/kotlin-test
+    testImplementation("org.jetbrains.kotlin:kotlin-test:$kotlinVersion")
+    compileOnly("org.jetbrains.kotlin:kotlin-stdlib:$kotlinVersion")
 
     compileOnly("org.jetbrains:annotations:24.0.1")
 

diff --git a/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/common/ExtendedPictographics.kt b/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/common/ExtendedPictographics.kt
@@ -19,6 +19,9 @@ internal fun isExtendedPictographic(codePoint: Int): Boolean {
 // A bitmap that maps a code point into whether it has the Extended_Pictographic property.
 // The code points go in increasing order by index and in reverse order by bit in a specific long.
 // This way a simple divmod is enough to compute both indices.
+//
+// The bitmap is generated from the emoji-data.txt of the Unicode Character Database:
+// https://www.unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
 private val bitmap = longArrayOf(
   0, 0, 72567767433216, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -78,4 +81,5 @@ private val bitmap = longArrayOf(
   274877906943, -577445914654736386, -512, -1, -1, -1, -1, -1, 576460752303423487, -1, -1, -1, -1,
   4611686018427387903, -64, -1, -1, -1, 65535, -1, -1, 0, -4503599627370496, 0, -2097152, 61440, 4227923712,
   -70368744112384, -1, -576460752303427584, -65, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1,
-  -1, -1, -1, -1, -1, -1, -1, 4611686018427387903)
+  -1, -1, -1, -1, -1, -1, -1, 4611686018427387903
+)
diff --git a/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/common/Graphemes.kt b/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/common/Graphemes.kt
@@ -1,13 +1,53 @@
 package com.maddyhome.idea.vim.common
 
+//
+// RATIONALE:
+//
+// As an alternative to implementing this ourselves, we could make use of ICU4J — the implementation
+// of the Unicode-related properties and algorithms maintained by the Unicode Consortium.
+//
+// The reason why it wasn't done so is twofold:
+//   1. ICU4J is a fairly big library, that provides the complete support for the Unicode specification,
+//      while at this moment all we need is a small subset of that functionality, namely the grapheme
+//      cluster boundaries search.
+//
+//   2. The exposed API is a little awkward to use and would require adapters to be efficient.
+//      To iterate over the grapheme cluster boundaries using ICU4J, one could employ the implementation
+//      of the `java.text.BreakIterator` provided by the library.
+//
+//      Given the specifics of the memory access patterns in the vim-engine with respect to grapheme
+//      cluster boundaries search (random access, only certain commands will ever care about the boundaries),
+//      using the sequential `BreakIterator` isn't very efficient. It also does quite a bit of extra
+//      work to enable faster random access of the areas that were visited before by maintaining a partial
+//      index.
+//
+//      JDK21 exposes a similar implementation of the `java.text.BreakIterator` interface, except it's
+//      even less efficient: upon the iterator creation, the full traversal of the given text is performed
+//      to build a full index over all the grapheme cluster boundaries, making the use of the said iterator
+//      O(n) both by time and memory.
+//
+//      We could still consider that option just for the sake of reducing the amount of code that we have
+//      to maintain, once JDK21 is released. The inefficiency can be reduced by taking a small fragment of
+//      the text and increasing its size if no boundaries were found, although the amount of extra work
+//      is still substantial.
+//
+//      Ironically, one of the packages of the JDK21 has almost exactly what we need (a very similar API
+//      to the one implemented here), but it is an internal package (`jdk.internal.util.regex`).
+//
+
 /**
  * Move over unicode extended grapheme cluster boundaries.
  *
  * https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
  */
 public object Graphemes {
+  /**
+   * Returns the next extended grapheme cluster boundary or `null` if the end of text has been reached.
+   */
   public fun next(charSeq: CharSequence, start: Int): Int? {
-    if (start < 0 || start >= charSeq.length) return null
+    require(start >= 0) { "'start' is out of bounds." }
+
+    if (start >= charSeq.length) return null
 
     return charSeq.nextBoundary(
       start,
@@ -18,16 +58,20 @@ public object Graphemes {
     )
   }
 
+  /**
+   * Returns the previous extended grapheme cluster boundary or `null` if the start of text has been reached.
+   */
   public fun prev(charSeq: CharSequence, start: Int): Int? {
-    if (start <= 0 || start > charSeq.length) return null
+    require(start <= charSeq.length) { "'start' is out of bounds" }
+
+    if (start <= 0) return null
 
     return charSeq.nextBoundary(
       start,
       next = Int::minus,
       nextCode = { if (it > 0) Character.codePointBefore(this, it) else null },
       prevCode = { current, charCount ->
-        if (current - charCount > 0)
-          Character.codePointBefore(this, current - charCount)
+        if (current - charCount > 0) Character.codePointBefore(this, current - charCount)
         else null
       },
       swap = true,
@@ -68,12 +112,14 @@ private inline fun CharSequence.nextBoundary(
     }
 
     // GB4 - break after Control, CR or LF.
-    if (type in arrayOf(CodePointType.CONTROL, CodePointType.CR, CodePointType.LF))
+    if (type in arrayOf(CodePointType.CONTROL, CodePointType.CR, CodePointType.LF)) {
       return next(current, charCount)
+    }
 
     // GB5 - break before Control, CR or LF.
-    if (nextType in arrayOf(CodePointType.CONTROL, CodePointType.CR, CodePointType.LF))
+    if (nextType in arrayOf(CodePointType.CONTROL, CodePointType.CR, CodePointType.LF)) {
       return next(current, charCount)
+    }
 
     // GB6 - do not break Hangul syllable sequence.
     if (type == CodePointType.L && nextType in arrayOf(
@@ -157,8 +203,9 @@ private inline fun CharSequence.countPrev(start: Int, crossinline pred: (Int) ->
   var count = 0
   while (current > 0) {
     val codePoint = Character.codePointBefore(this, current)
-    if (!pred(codePoint))
+    if (!pred(codePoint)) {
       break
+    }
     current -= Character.charCount(codePoint)
     count++
   }
@@ -178,8 +225,9 @@ private fun classify(codePoint: Int): CodePointType {
     in 0 until 0x80 -> return CodePointType.OTHER
   }
 
-  if (isExtendedPictographic(codePoint))
+  if (isExtendedPictographic(codePoint)) {
     return CodePointType.EXTENDED_PICTOGRAPHIC
+  }
 
   val type = Character.getType(codePoint).toByte()
   return when (type) {

diff --git a/vim-engine/src/test/kotlin/com/maddyhome/idea/vim/common/GraphemesTest.kt b/vim-engine/src/test/kotlin/com/maddyhome/idea/vim/common/GraphemesTest.kt
@@ -0,0 +1,108 @@
+package com.maddyhome.idea.vim.common
+
+import java.nio.file.Files
+import java.nio.file.Paths
+import org.junit.jupiter.api.Test
+import kotlin.math.max
+import kotlin.math.min
+import kotlin.test.assertEquals
+
+/** Checks [Graphemes] against the conformance cases of the UCD's `GraphemeBreakTest.txt`. */
+class GraphemesTest {
+  @Test
+  fun `test next() against UCDs GraphemeBreakTest_txt`() {
+    for ((i, testCase) in loadTestCases().withIndex()) {
+      val actualGraphemes = graphemes(testCase.string, start = 0, next = Graphemes::next)
+
+      assertEquals(testCase.graphemes, actualGraphemes, "test case #$i")
+    }
+  }
+
+  @Test
+  fun `test prev() against UCDs GraphemeBreakTest_txt`() {
+    for ((i, testCase) in loadTestCases().withIndex()) {
+      val actualGraphemes = graphemes(testCase.string, start = testCase.string.length, next = Graphemes::prev)
+
+      assertEquals(testCase.graphemes.reversed(), actualGraphemes, "test case #$i")
+    }
+  }
+
+  /** Loads the UCD test cases, failing instead of passing vacuously when none could be parsed. */
+  private fun loadTestCases(): List<TestCase> {
+    val testCases = parseGraphemeBreakTestCases(resource("GraphemeBreakTest.txt"))
+    check(testCases.isNotEmpty()) { "no test cases were parsed from GraphemeBreakTest.txt" }
+    return testCases
+  }
+
+  /**
+   * Breaks a string into a list of grapheme clusters using the testee class ([Graphemes]).
+   *
+   * Walks from [start] in whichever direction [next] moves until it returns `null`; the clusters are
+   * listed in traversal order.
+   */
+  private fun graphemes(text: String, start: Int, next: (CharSequence, Int) -> Int?): List<String> {
+    var boundary = start
+    val graphemes = mutableListOf<String>()
+    while (true) {
+      val nextBoundary = next(text, boundary) ?: break
+      // A boundary that doesn't move would loop forever; fail the test instead of hanging it.
+      check(nextBoundary != boundary) { "boundary didn't advance from offset $boundary" }
+
+      // Since we may traverse in both directions, we should properly get the grapheme range.
+      val from = min(boundary, nextBoundary)
+      val to = max(boundary, nextBoundary)
+
+      graphemes.add(text.substring(from, to))
+      boundary = nextBoundary
+    }
+    return graphemes
+  }
+
+  private fun parseGraphemeBreakTestCases(contents: String) = contents.lines().mapNotNull { parseTestCase(it) }
+
+  /**
+   * Parses a single test case, returning `null` for lines without a `#` comment or with nothing before it.
+   *
+   * The test cases are presented as a sequence of code points in the following format:
+   *     ÷ 034F × 0308 ÷ 0020 ÷
+   * Where the "÷" symbol represents a break (including start of text and end of text breaks)
+   * and the "×" symbol means that the two adjacent code points are part of the same grapheme cluster.
+   * Each code point is encoded as a hexadecimal.
+   */
+  private fun parseTestCase(line: String): TestCase? {
+    val match = withoutCommentRegex.find(line) ?: return null
+
+    val testBody = match.groupValues[1].trim()
+    if (testBody.isEmpty()) return null
+
+    // Each chunk between two break marks is one expected grapheme cluster.
+    val clusters = testBody.split('÷')
+      .filter { it.isNotBlank() }
+      .map { cluster ->
+        buildString {
+          cluster.split('×')
+            .filter { it.isNotBlank() }
+            .forEach { codePoint -> appendCodePoint(codePoint.trim().toInt(radix = 16)) }
+        }
+      }
+
+    return TestCase(string = clusters.joinToString(separator = ""), graphemes = clusters)
+  }
+
+  private fun resource(name: String): String {
+    val resourceUrl = javaClass.classLoader.getResource(name)
+      ?: error("resource `$name' wasn't found")
+
+    return Files.readString(Paths.get(resourceUrl.toURI()))
+  }
+
+  private data class TestCase(
+    val string: String,
+    val graphemes: List<String>
+  )
+
+  companion object {
+    /** Extracts the text before the comment symbol '#'. */
+    val withoutCommentRegex = Regex("""^(.*?)#.*""")
+  }
+}