Skip to content

Commit

Permalink
Add grapheme cluster handling tests
Browse files Browse the repository at this point in the history
GraphemeBreakTest.txt was downloaded from the Unicode Character Database [0].

Changes to build.gradle.kts were required to stop `gradlew test` from
regenerating the resources with empty JSON objects, and to add the
kotlin-test dependency.

[0]: https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
  • Loading branch information
ludwig-jb authored and AlexPl292 committed Aug 14, 2023
1 parent 41177b9 commit 068d610
Show file tree
Hide file tree
Showing 5 changed files with 809 additions and 10 deletions.
11 changes: 10 additions & 1 deletion vim-engine/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ plugins {
`maven-publish`
}

val kotlinVersion: String by project

// group 'org.jetbrains.ideavim'
// version 'SNAPSHOT'

Expand All @@ -27,10 +29,17 @@ ksp {
arg("ex_commands_file", "engine_ex_commands.json")
}

afterEvaluate {
tasks.named("kspTestKotlin").configure { enabled = false }
}

dependencies {
testImplementation("org.junit.jupiter:junit-jupiter-api:5.9.2")
testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine:5.9.2")
compileOnly("org.jetbrains.kotlin:kotlin-stdlib:1.8.21")

// https://mvnrepository.com/artifact/org.jetbrains.kotlin/kotlin-test
testImplementation("org.jetbrains.kotlin:kotlin-test:$kotlinVersion")
compileOnly("org.jetbrains.kotlin:kotlin-stdlib:$kotlinVersion")

compileOnly("org.jetbrains:annotations:24.0.1")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ internal fun isExtendedPictographic(codePoint: Int): Boolean {
// A bitmap that maps a code point into whether it has the Extended_Pictographic property.
// The code points go in increasing order by index and in reverse order by bit in a specific long.
// This way a simple divmod is enough to compute both indices.
//
// The bitmap is generated from the emoji-data.txt of the Unicode Character Database:
// https://www.unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
private val bitmap = longArrayOf(
0, 0, 72567767433216, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Expand Down Expand Up @@ -78,4 +81,5 @@ private val bitmap = longArrayOf(
274877906943, -577445914654736386, -512, -1, -1, -1, -1, -1, 576460752303423487, -1, -1, -1, -1,
4611686018427387903, -64, -1, -1, -1, 65535, -1, -1, 0, -4503599627370496, 0, -2097152, 61440, 4227923712,
-70368744112384, -1, -576460752303427584, -65, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, 4611686018427387903)
-1, -1, -1, -1, -1, -1, -1, 4611686018427387903
)
Original file line number Diff line number Diff line change
@@ -1,13 +1,53 @@
package com.maddyhome.idea.vim.common

//
// RATIONALE:
//
// As an alternative to implementing this ourselves, we could make use of the ICU4J — the implementation
// of the unicode related properties and algorithms maintained by the unicode organisation.
//
// The reason why it wasn't done so is twofold:
// 1. ICU4J is a fairly big library, that provides the complete support for the Unicode specification,
// while at this moment all we need is a small subset of that functionality, namely the grapheme
// cluster boundaries search.
//
// 2. The exposed API is a little awkward to use and would require adapters to be efficient.
// To iterate over the grapheme cluster boundaries using ICU4J, one could employ the implementation
// of the `java.text.BreakIterator` provided by the library.
//
// Given the specifics of the memory access patterns in the vim-engine with respect to grapheme
// cluster boundaries search (random access, only certain commands will ever care about the boundaries),
// using the sequential `BreakIterator` isn't very efficient. It also does quite a bit of extra
// work to enable faster random access of the areas that were visited before by maintaining a partial
// index.
//
// JDK21 exposes a similar implementation of the `java.text.BreakIterator` interface, except it's
// even less efficient: upon the iterator creation, the full traversal of the given text is performed
// to build a full index over all the grapheme cluster boundaries, making the use of the said iterator
// O(n) both by time and memory.
//
// We could still consider that option just for the sake of reducing the amount of code that we have
// to maintain, once JDK21 is released. The inefficiency can be reduced by taking a small fragment of
// the text and increasing the size if no boundaries were found, although the amount of extra work
// is still substantial.
//
// Ironically, one of the packages of the JDK21 has almost exactly what we need (a very similar API
// to the one implemented here), but it is an internal package (`jdk.internal.util.regex`).
//

/**
* Move over unicode extended grapheme cluster boundaries.
*
* https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
*/
public object Graphemes {
/**
* Returns the next extended grapheme cluster boundary or `null` if the end of text has been reached.
*/
public fun next(charSeq: CharSequence, start: Int): Int? {
if (start < 0 || start >= charSeq.length) return null
require(start >= 0) { "'start' is out of bounds." }

if (start >= charSeq.length) return null

return charSeq.nextBoundary(
start,
Expand All @@ -18,16 +58,20 @@ public object Graphemes {
)
}

/**
* Returns the previous extended grapheme cluster boundary or `null` if the start of text has been reached.
*/
public fun prev(charSeq: CharSequence, start: Int): Int? {
if (start <= 0 || start > charSeq.length) return null
require(start <= charSeq.length) { "'start' is out of bounds" }

if (start <= 0) return null

return charSeq.nextBoundary(
start,
next = Int::minus,
nextCode = { if (it > 0) Character.codePointBefore(this, it) else null },
prevCode = { current, charCount ->
if (current - charCount > 0)
Character.codePointBefore(this, current - charCount)
if (current - charCount > 0) Character.codePointBefore(this, current - charCount)
else null
},
swap = true,
Expand Down Expand Up @@ -68,12 +112,14 @@ private inline fun CharSequence.nextBoundary(
}

// GB4 - break after Control, CR or LF.
if (type in arrayOf(CodePointType.CONTROL, CodePointType.CR, CodePointType.LF))
if (type in arrayOf(CodePointType.CONTROL, CodePointType.CR, CodePointType.LF)) {
return next(current, charCount)
}

// GB5 - break before Control, CR or LF.
if (nextType in arrayOf(CodePointType.CONTROL, CodePointType.CR, CodePointType.LF))
if (nextType in arrayOf(CodePointType.CONTROL, CodePointType.CR, CodePointType.LF)) {
return next(current, charCount)
}

// GB6 - do not break Hangul syllable sequence.
if (type == CodePointType.L && nextType in arrayOf(
Expand Down Expand Up @@ -157,8 +203,9 @@ private inline fun CharSequence.countPrev(start: Int, crossinline pred: (Int) ->
var count = 0
while (current > 0) {
val codePoint = Character.codePointBefore(this, current)
if (!pred(codePoint))
if (!pred(codePoint)) {
break
}
current -= Character.charCount(codePoint)
count++
}
Expand All @@ -178,8 +225,9 @@ private fun classify(codePoint: Int): CodePointType {
in 0 until 0x80 -> return CodePointType.OTHER
}

if (isExtendedPictographic(codePoint))
if (isExtendedPictographic(codePoint)) {
return CodePointType.EXTENDED_PICTOGRAPHIC
}

val type = Character.getType(codePoint).toByte()
return when (type) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
package com.maddyhome.idea.vim.common

import java.nio.file.Files
import java.nio.file.Paths
import org.junit.jupiter.api.Test
import kotlin.math.max
import kotlin.math.min
import kotlin.test.assertEquals

/**
 * Checks [Graphemes] against the grapheme cluster conformance data of the Unicode Character Database.
 *
 * The data is read from the `GraphemeBreakTest.txt` test resource. Every test case is traversed once
 * forwards with [Graphemes.next] and once backwards with [Graphemes.prev].
 */
class GraphemesTest {
  @Test
  fun `test next() against UCDs GraphemeBreakTest_txt`() {
    val testCases = parseGraphemeBreakTestCases(resource("GraphemeBreakTest.txt"))

    for ((i, testCase) in testCases.withIndex()) {
      val actualGraphemes = graphemes(testCase.string, start = 0, next = Graphemes::next)

      assertEquals(testCase.graphemes, actualGraphemes, "test case #$i")
    }
  }

  @Test
  fun `test prev() against UCDs GraphemeBreakTest_txt`() {
    val testCases = parseGraphemeBreakTestCases(resource("GraphemeBreakTest.txt"))

    for ((i, testCase) in testCases.withIndex()) {
      val actualGraphemes = graphemes(testCase.string, start = testCase.string.length, next = Graphemes::prev)

      // Walking backwards yields the same clusters, last one first.
      assertEquals(testCase.graphemes.reversed(), actualGraphemes, "test case #$i")
    }
  }

  /**
   * Breaks a string into a list of grapheme clusters using the testee class ([Graphemes]).
   *
   * @param text the string to split.
   * @param start the offset the traversal starts from.
   * @param next the boundary search under test; the traversal stops as soon as it returns `null`.
   * @return the clusters in the order they were visited.
   * @throws IllegalStateException if [next] stops making progress, so that a broken implementation
   *   fails the test instead of hanging it.
   */
  private fun graphemes(text: String, start: Int, next: (CharSequence, Int) -> Int?): List<String> {
    var boundary = start
    val graphemes = mutableListOf<String>()
    while (true) {
      val nextBoundary = next(text, boundary) ?: break

      // A boundary that stays in place would keep this loop spinning forever.
      check(nextBoundary != boundary) {
        "the boundary did not move from offset $boundary (text length: ${text.length})"
      }

      // Since we may traverse in both directions, we should properly get the grapheme range.
      val from = min(boundary, nextBoundary)
      val to = max(boundary, nextBoundary)
      graphemes.add(text.substring(from, to))

      // Clusters are non-empty and disjoint, so there can never be more of them than chars.
      // Exceeding that means the boundaries go back and forth.
      check(graphemes.size <= text.length) {
        "found more grapheme clusters than chars in the text (text length: ${text.length})"
      }

      boundary = nextBoundary
    }
    return graphemes
  }

  /** Parses every line of [contents], dropping the lines that carry no test case. */
  private fun parseGraphemeBreakTestCases(contents: String) = contents.lines().mapNotNull { parseTestCase(it) }

  /**
   * Parses a single test case.
   *
   * The test cases are presented as a sequence of code points in the following format:
   *   ÷ 034F × 0308 ÷ 0020 ÷
   * Where the "÷" symbol represents a break (including start of text and end of text breaks)
   * and the "×" symbol means that the two adjacent code points are part of the same grapheme cluster.
   * Each code point is encoded as a hexadecimal.
   *
   * @return the parsed test case, or `null` if the line has no '#' comment or nothing but
   *   whitespace in front of it.
   */
  private fun parseTestCase(line: String): TestCase? {
    val match = withoutCommentRegex.find(line) ?: return null

    // Index 0 is the whole match, index 1 is the only capturing group of the regex.
    val testBody = match.groupValues[1].trim()
    if (testBody.isEmpty()) return null

    val breakChar = '÷'
    val joinChar = '×'

    val clusters = testBody
      .split(breakChar)
      .filter { it.isNotBlank() }
      .map { cluster ->
        cluster
          .split(joinChar)
          .filter { it.isNotBlank() }
          .joinToString(separator = "") { codePoint ->
            String(Character.toChars(codePoint.trim().toInt(16)))
          }
      }

    return TestCase(string = clusters.joinToString(separator = ""), graphemes = clusters)
  }

  /** Reads the whole classpath resource [name] as text, failing if it cannot be located. */
  private fun resource(name: String): String {
    val resourceUrl = javaClass.classLoader.getResource(name)
      ?: error("resource `$name' wasn't found")

    return Files.readString(Paths.get(resourceUrl.toURI()))
  }

  /**
   * A single conformance test case.
   *
   * @property string the whole text of the test case.
   * @property graphemes the grapheme clusters [string] is expected to break into, in text order.
   */
  private data class TestCase(
    val string: String,
    val graphemes: List<String>,
  )

  companion object {
    /** Extracts the text before the comment symbol '#'. */
    val withoutCommentRegex = Regex("""^(.*?)#.*""")
  }
}
Loading

0 comments on commit 068d610

Please sign in to comment.