From 41177b91bee2b8bafd232f0fba643bf839493d69 Mon Sep 17 00:00:00 2001 From: Ludwig Valda Vasquez Date: Mon, 24 Jul 2023 17:12:43 +0200 Subject: [PATCH] Handle unicode grapheme clusters Some characters that render as a single symbol can span a sequence of several Unicode code points (e.g., flag emojis, a combination of a letter and a diacritic, Hangul syllables, etc.). Such composites are called grapheme clusters in the Unicode standard, and this patch introduces recognition of extended grapheme cluster boundaries, which makes it possible to iterate over rendered characters. Without this, the user may observe the cursor being "stuck" inside a character for several keystrokes while it makes its way through each code point in the grapheme cluster. The implementation follows the boundary search algorithm outlined in Technical Report 29 of the Unicode standard[1]. It was tested against the set of test cases provided by the Unicode Character Database[2]. In addition to the grapheme cluster boundary search itself, this patch adds an `isExtendedPictographic` function that answers whether a given code point has the Unicode "Extended_Pictographic" property, which is required to correctly determine grapheme cluster boundaries. This function is implemented natively in JDK 21 and can be removed once we start targeting that version. The Extended_Pictographic property is stored as a bitmap. I considered building a similar map for the code point classification in the grapheme cluster boundary search implementation, which could yield better performance, but that would require adding at least another half a megabyte of data to the JAR, so I settled for a chain of conditionals instead. That can be reconsidered and shouldn't be difficult to change if the performance impact turns out to be noticeable (it didn't show up in my simple tests). 
A few functions in the vim-engine were adjusted to handle grapheme clusters (such as getting the horizontal offset and adjusting the cursor to not reach over the end of the line). [1]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries [2]: https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt --- .../motion/leftright/MotionEndActionTest.kt | 23 ++ .../motion/leftright/MotionLeftActionTest.kt | 16 ++ .../motion/leftright/MotionRightActionTest.kt | 26 ++ .../idea/vim/api/EngineEditorHelper.kt | 8 +- .../idea/vim/api/VimMotionGroupBase.kt | 23 +- .../idea/vim/common/ExtendedPictographics.kt | 81 ++++++ .../maddyhome/idea/vim/common/Graphemes.kt | 255 ++++++++++++++++++ 7 files changed, 418 insertions(+), 14 deletions(-) create mode 100644 vim-engine/src/main/kotlin/com/maddyhome/idea/vim/common/ExtendedPictographics.kt create mode 100644 vim-engine/src/main/kotlin/com/maddyhome/idea/vim/common/Graphemes.kt diff --git a/src/test/java/org/jetbrains/plugins/ideavim/action/motion/leftright/MotionEndActionTest.kt b/src/test/java/org/jetbrains/plugins/ideavim/action/motion/leftright/MotionEndActionTest.kt index d5243d2a2f..3dd96195d5 100644 --- a/src/test/java/org/jetbrains/plugins/ideavim/action/motion/leftright/MotionEndActionTest.kt +++ b/src/test/java/org/jetbrains/plugins/ideavim/action/motion/leftright/MotionEndActionTest.kt @@ -159,4 +159,27 @@ class MotionEndActionTest : VimTestCase() { """.trimIndent() doTest(keys, before, after, VimStateMachine.Mode.COMMAND, VimStateMachine.SubMode.NONE) } + + @TestWithoutNeovim(SkipNeovimReason.NON_ASCII) + @OptionTest(VimOption(TestOptionConstants.keymodel, doesntAffectTest = true)) + fun `test motion end with multiple code point grapheme cluster at the end`() { + val keys = listOf("") + val before = """ + Lorem Ipsum + + I found it in ${c}a legendary landπŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§ + consectetur adipiscing elit + Sed in orci mauris. + Cras id tellus in ex imperdiet egestas. 
+ """.trimIndent() + val after = """ + Lorem Ipsum + + I found it in a legendary land${c}πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§ + consectetur adipiscing elit + Sed in orci mauris. + Cras id tellus in ex imperdiet egestas. + """.trimIndent() + doTest(keys, before, after, VimStateMachine.Mode.COMMAND, VimStateMachine.SubMode.NONE) + } } diff --git a/src/test/java/org/jetbrains/plugins/ideavim/action/motion/leftright/MotionLeftActionTest.kt b/src/test/java/org/jetbrains/plugins/ideavim/action/motion/leftright/MotionLeftActionTest.kt index abc9bb86e1..ef48c1fffa 100644 --- a/src/test/java/org/jetbrains/plugins/ideavim/action/motion/leftright/MotionLeftActionTest.kt +++ b/src/test/java/org/jetbrains/plugins/ideavim/action/motion/leftright/MotionLeftActionTest.kt @@ -102,4 +102,20 @@ class MotionLeftActionTest : VimTestCase() { enterCommand("set whichwrap=h") } } + + @TestWithoutNeovim(SkipNeovimReason.NON_ASCII) + @Test + fun `test simple motion multiple code point grapheme cluster`() { + doTest( + "h", + """ + Oh, hi Mark + You are myπŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§${c} favourite customer + """.trimIndent(), + """ + Oh, hi Mark + You are my${c}πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§ favourite customer + """.trimIndent(), + ) + } } diff --git a/src/test/java/org/jetbrains/plugins/ideavim/action/motion/leftright/MotionRightActionTest.kt b/src/test/java/org/jetbrains/plugins/ideavim/action/motion/leftright/MotionRightActionTest.kt index af379bf7e9..0b44a81db2 100644 --- a/src/test/java/org/jetbrains/plugins/ideavim/action/motion/leftright/MotionRightActionTest.kt +++ b/src/test/java/org/jetbrains/plugins/ideavim/action/motion/leftright/MotionRightActionTest.kt @@ -189,6 +189,32 @@ class MotionRightActionTest : VimTestCase() { ) } + @TestWithoutNeovim(SkipNeovimReason.NON_ASCII) + @OptionTest(VimOption(TestOptionConstants.virtualedit, doesntAffectTest = true)) + fun `test simple motion multiple code point grapheme cluster`() { + doTest( + "l", + """ + Lorem Ipsum + + I found it in a 
legendar${c}πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§ land + consectetur adipiscing elit + Sed in orci mauris. + Cras id tellus in ex imperdiet egestas. + """.trimIndent(), + """ + Lorem Ipsum + + I found it in a legendarπŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§${c} land + consectetur adipiscing elit + Sed in orci mauris. + Cras id tellus in ex imperdiet egestas. + """.trimIndent(), + VimStateMachine.Mode.COMMAND, + VimStateMachine.SubMode.NONE, + ) + } + @TestWithoutNeovim(SkipNeovimReason.NON_ASCII) @OptionTest(VimOption(TestOptionConstants.virtualedit, doesntAffectTest = true)) fun `test simple motion czech`() { diff --git a/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/api/EngineEditorHelper.kt b/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/api/EngineEditorHelper.kt index b0412e9152..fc74be7d4a 100644 --- a/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/api/EngineEditorHelper.kt +++ b/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/api/EngineEditorHelper.kt @@ -8,6 +8,7 @@ package com.maddyhome.idea.vim.api +import com.maddyhome.idea.vim.common.Graphemes import com.maddyhome.idea.vim.common.TextRange import java.nio.CharBuffer @@ -146,7 +147,12 @@ public fun VimEditor.getLineEndOffset(line: Int, allowEnd: Boolean): Int { } else { val startOffset: Int = getLineStartOffset(line) val endOffset: Int = getLineEndOffset(line) - endOffset - if (startOffset == endOffset || allowEnd) 0 else 1 + + if (startOffset == endOffset || allowEnd) { + endOffset + } else { + Graphemes.prev(text(), endOffset) ?: endOffset + } } } diff --git a/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/api/VimMotionGroupBase.kt b/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/api/VimMotionGroupBase.kt index ea3d439713..a57c5f16e4 100644 --- a/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/api/VimMotionGroupBase.kt +++ b/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/api/VimMotionGroupBase.kt @@ -12,6 +12,7 @@ import com.maddyhome.idea.vim.action.motion.leftright.TillCharacterMotionType import 
com.maddyhome.idea.vim.command.Argument import com.maddyhome.idea.vim.command.MotionType import com.maddyhome.idea.vim.command.OperatorArguments +import com.maddyhome.idea.vim.common.Graphemes import com.maddyhome.idea.vim.common.TextRange import com.maddyhome.idea.vim.handler.Motion import com.maddyhome.idea.vim.handler.Motion.AbsoluteOffset @@ -23,6 +24,7 @@ import com.maddyhome.idea.vim.helper.isEndAllowed import com.maddyhome.idea.vim.helper.isEndAllowedIgnoringOnemore import com.maddyhome.idea.vim.helper.mode import kotlin.math.abs +import kotlin.math.absoluteValue import kotlin.math.min import kotlin.math.sign @@ -108,21 +110,16 @@ public abstract class VimMotionGroupBase : VimMotionGroup { allowPastEnd: Boolean, allowWrap: Boolean, ): Motion { - val oldOffset = caret.offset.point - var diff = 0 val text = editor.text() - val sign = sign(count.toFloat()).toInt() - for (pointer in IntProgression.fromClosedRange(0, count - sign, sign)) { - val textPointer = oldOffset + pointer - diff += if (textPointer < text.length && textPointer >= 0) { - // Actual char size can differ from 1 if unicode characters are used (like πŸ”) - Character.charCount(Character.codePointAt(text, textPointer)) - } else { - 1 - } + val oldOffset = caret.offset.point + var current = oldOffset + for (i in 0 until count.absoluteValue) { + val newOffset = if (count > 0) Graphemes.next(text, current) else Graphemes.prev(text, current) + current = newOffset ?: break } + val offset = if (allowWrap) { - var newOffset = oldOffset + sign * diff + var newOffset = current val oldLine = editor.offsetToBufferPosition(oldOffset).line val newLine = editor.offsetToBufferPosition(newOffset).line if (!allowPastEnd && count > 0 && oldLine == newLine && newOffset == editor.getLineEndForOffset(newOffset)) { @@ -130,7 +127,7 @@ public abstract class VimMotionGroupBase : VimMotionGroup { } editor.normalizeOffset(newOffset, allowPastEnd) } else { - editor.normalizeOffset(caret.getLine().line, oldOffset + (sign * 
diff), allowPastEnd) + editor.normalizeOffset(caret.getLine().line, current, allowPastEnd) } return offset.toMotionOrError() diff --git a/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/common/ExtendedPictographics.kt b/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/common/ExtendedPictographics.kt new file mode 100644 index 0000000000..f6e2546212 --- /dev/null +++ b/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/common/ExtendedPictographics.kt @@ -0,0 +1,81 @@ +package com.maddyhome.idea.vim.common + +/** + * Answers whether a given code point is a unicode Extended_Pictographic. + * + * NOTE: this is a part of the Java 21 API. Can be removed once we start targeting that version. + */ +internal fun isExtendedPictographic(codePoint: Int): Boolean { + // Outside of the bitmap. + if (codePoint >= bitmap.size * 64) return false + + val idx = codePoint / 64 + val bit = codePoint % 64 + val bucket = bitmap[idx] + + return (bucket and (1L shl bit)) != 0L +} + +// A bitmap that maps a code point into whether it has the Extended_Pictographic property. +// The code points go in increasing order by index and in reverse order by bit in a specific long. +// This way a simple divmod is enough to compute both indices. 
+private val bitmap = longArrayOf( + 0, 0, 72567767433216, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1152921504606846976, 512, 0, 0, 144115205255725056, 0, + 6597135826944, 0, 0, 0, 0, 0, 1099712954368, 0, 256, 508904558869643264, 0, 0, 0, 4, 0, 0, 18027592649015296, + 8646911284551352321, -524353, -1, -65473, -1, 6756508085255999, 1065163968656, -9223090553273450496, 0, 0, 0, 0, 0, + 13510798882111488, 0, 0, 0, 0, 0, 0, 0, 402653408, 2162688, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2306124484190404608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41943040, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 140737488412672, -4610577710706589696, -35184237985792, + 274877906943, -577445914654736386, -512, -1, -1, -1, -1, -1, 576460752303423487, -1, -1, -1, -1, + 4611686018427387903, -64, -1, -1, -1, 65535, -1, -1, 0, -4503599627370496, 0, -2097152, 61440, 4227923712, + -70368744112384, -1, -576460752303427584, -65, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, 4611686018427387903) \ No newline at end of file diff --git a/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/common/Graphemes.kt b/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/common/Graphemes.kt new file mode 100644 index 0000000000..c8350db8c7 --- /dev/null +++ b/vim-engine/src/main/kotlin/com/maddyhome/idea/vim/common/Graphemes.kt @@ -0,0 +1,255 @@ +package com.maddyhome.idea.vim.common + +/** + * Move over unicode extended grapheme cluster boundaries. + * + * https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries + */ +public object Graphemes { + public fun next(charSeq: CharSequence, start: Int): Int? { + if (start < 0 || start >= charSeq.length) return null + + return charSeq.nextBoundary( + start, + next = Int::plus, + nextCode = { if (it < length) Character.codePointAt(this, it) else null }, + prevCode = { current, _ -> if (current > 0) Character.codePointBefore(this, current) else null }, + swap = false, + ) + } + + public fun prev(charSeq: CharSequence, start: Int): Int? 
{ + if (start <= 0 || start > charSeq.length) return null + + return charSeq.nextBoundary( + start, + next = Int::minus, + nextCode = { if (it > 0) Character.codePointBefore(this, it) else null }, + prevCode = { current, charCount -> + if (current - charCount > 0) + Character.codePointBefore(this, current - charCount) + else null + }, + swap = true, + ) + } +} + +private inline fun CharSequence.nextBoundary( + start: Int, + crossinline next: (Int, Int) -> Int, + crossinline nextCode: CharSequence.(Int) -> Int?, + crossinline prevCode: CharSequence.(Int, Int) -> Int?, + swap: Boolean, +): Int { + var current = start + while (true) { + var codePoint = nextCode(current) ?: return current + val charCount = Character.charCount(codePoint) + var nextCodePoint = nextCode(next(current, charCount)) ?: return next(current, charCount) + val nextCharCount = Character.charCount(nextCodePoint) + + // Below the two code points are inspected in the direct order, following the grapheme breaking rules. + // To not duplicate the rules depending on the traversal direction, we ensure that the two code points + // are inspected in the same order by swapping them when we are traversing backwards. + if (swap) { + val temp = codePoint + codePoint = nextCodePoint + nextCodePoint = temp + } + + val type = classify(codePoint) + val nextType = classify(nextCodePoint) + + // GB3 - do not break CR x LF. + if (type == CodePointType.CR && nextType == CodePointType.LF) { + current = next(current, charCount) + continue + } + + // GB4 - break after Control, CR or LF. + if (type in arrayOf(CodePointType.CONTROL, CodePointType.CR, CodePointType.LF)) + return next(current, charCount) + + // GB5 - break before Control, CR or LF. + if (nextType in arrayOf(CodePointType.CONTROL, CodePointType.CR, CodePointType.LF)) + return next(current, charCount) + + // GB6 - do not break Hangul syllable sequence. 
+ if (type == CodePointType.L && nextType in arrayOf( + CodePointType.L, + CodePointType.V, + CodePointType.LV, + CodePointType.LVT + ) + ) { + current = next(current, charCount) + continue + } + + // GB7 - ditto. + if ((type == CodePointType.LV || type == CodePointType.V) && (nextType == CodePointType.V || nextType == CodePointType.T)) { + current = next(current, charCount) + continue + } + + // GB8 - ditto. + if ((type == CodePointType.LVT || type == CodePointType.T) && nextType == CodePointType.T) { + current = next(current, charCount) + continue + } + + // GB9, GB9a, GB9b - do not break before extending characters or ZWJ. + if (type == CodePointType.PREPEND || nextType in arrayOf( + CodePointType.EXTEND, + CodePointType.ZWJ, + CodePointType.SPACING_MARK + ) + ) { + current = next(current, charCount) + continue + } + + // GB11 - do not break within emoji modifier sequneces or emoji ZWJ sequences. + if (type == CodePointType.EXTENDED_PICTOGRAPHIC) { + if (nextType in arrayOf(CodePointType.EXTEND, CodePointType.ZWJ)) { + current = next(current, charCount) + continue + } + } + + if (type == CodePointType.EXTEND && nextType in arrayOf(CodePointType.EXTEND, CodePointType.ZWJ)) { + current = next(current, charCount) + continue + } + + if (type == CodePointType.ZWJ && nextType == CodePointType.EXTENDED_PICTOGRAPHIC) { + // Unlike nextCode, which will return either the one to the right (in direct order) or the one to the left + // (in reverse order) code point with respect to `current', prevCode will always return a code preceeding + // the two currently inspected (the one to the left of both of them). + val prevCodePoint = prevCode(current, charCount + nextCharCount) + if (prevCodePoint != null) { + val prevType = classify(prevCodePoint) + + if (prevType == CodePointType.EXTEND || prevType == CodePointType.EXTENDED_PICTOGRAPHIC) { + current = next(current, charCount) + continue + } + } + } + + // GB12, GB13 - do not break within emoji flag sequences. 
+ if (type == CodePointType.REGIONAL_INDICATOR) { + val count = countPrev(current) { classify(it) == CodePointType.REGIONAL_INDICATOR } + if (nextType == CodePointType.REGIONAL_INDICATOR && count % 2 == 0) { + current = next(current, charCount) + continue + } + } + + // GB999 - otherwise, break everywhere. + return next(current, charCount) + } +} + +private inline fun CharSequence.countPrev(start: Int, crossinline pred: (Int) -> Boolean): Int { + var current = start + var count = 0 + while (current > 0) { + val codePoint = Character.codePointBefore(this, current) + if (!pred(codePoint)) + break + current -= Character.charCount(codePoint) + count++ + } + return count +} + +/** + * Returns the grapheme cluster break property value for the given code point. + * + * https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt + */ +private fun classify(codePoint: Int): CodePointType { + when (codePoint) { + 0xD -> return CodePointType.CR + 0xA -> return CodePointType.LF + in 0 until 0x20 -> return CodePointType.CONTROL + in 0 until 0x80 -> return CodePointType.OTHER + } + + if (isExtendedPictographic(codePoint)) + return CodePointType.EXTENDED_PICTOGRAPHIC + + val type = Character.getType(codePoint).toByte() + return when (type) { + Character.UNASSIGNED -> when (codePoint) { + in 0x2064..0x2069, in 0xFFF0..0xFFF8, 0xE0000, in 0xE0002..0xE001F, in 0xE0080..0xE00FF, in 0xE01F0..0xE0FFF -> CodePointType.CONTROL + else -> CodePointType.OTHER + } + + Character.MODIFIER_LETTER, Character.MODIFIER_SYMBOL -> when (codePoint) { + 0xFF9E, 0xFF9F, in 0x1F3FB..0x1F3FF -> CodePointType.EXTEND + else -> CodePointType.OTHER + } + + Character.FORMAT -> when (codePoint) { + 0x200D -> CodePointType.ZWJ + in 0x0600..0x0605, 0x06DD, 0x070F, in 0x0890..0x0891, 0x08E2, 0x110BD, 0x110CD -> CodePointType.PREPEND + 0x200C, in 0xE0020..0xE007F -> CodePointType.EXTEND + else -> CodePointType.CONTROL + } + + Character.LINE_SEPARATOR, Character.PARAGRAPH_SEPARATOR, 
Character.CONTROL -> CodePointType.CONTROL + + Character.OTHER_LETTER -> when (codePoint) { + 0x0D4E, in 0x111C2..0x111C3, 0x1193F, 0x11941, 0x11A3A, in 0x11A84..0x11A89, 0x11D46, 0x11F02 -> CodePointType.PREPEND + 0x0E33, 0x0EB3 -> CodePointType.SPACING_MARK + in 0x1100..0x115F, in 0xA960..0xA97C -> CodePointType.L + in 0x1160..0x11A7, in 0xD7B0..0xD7C6 -> CodePointType.V + in 0x11A8..0x11FF, in 0xD7CB..0xD7FB -> CodePointType.T + // LV is encountered every 28 characters, everything in-between is LVT. + in 0xAC00..0xD7A3 -> if ((codePoint - 0xAC00) % 28 == 0) CodePointType.LV else CodePointType.LVT + else -> CodePointType.OTHER + } + + Character.OTHER_SYMBOL -> when (codePoint) { + in 0x1F1E6..0x1F1FF -> CodePointType.REGIONAL_INDICATOR + else -> CodePointType.OTHER + } + + Character.NON_SPACING_MARK, Character.ENCLOSING_MARK -> CodePointType.EXTEND + + Character.COMBINING_SPACING_MARK -> when (codePoint) { + 0x09BE, 0x09D7, 0x0b3E, 0x0B57, 0x0BBE, 0x0BD7, 0x0CC2, in 0x0CD5..0x0CD6, 0x0D3E, 0x0D57, 0x0DCF, + 0x0DDF, 0x1B35, in 0x302E..0x302F, 0x1133E, 0x11357, 0x114B0, 0x114BD, 0x115AF, 0x11930, 0x1D165, + in 0x1D16E..0x1D172 -> CodePointType.EXTEND + + 0x102B, 0x102C, 0x1038, in 0x1062..0x1064, in 0x1067..0x106D, 0x1083, in 0x1087..0x108C, 0x108F, in 0x109A..0x109C, + 0x1A61, 0x1A63, 0x1A64, 0xAA7B, 0xAA7D -> CodePointType.OTHER + + else -> CodePointType.SPACING_MARK + } + + else -> CodePointType.OTHER + } +} + +private enum class CodePointType { + CR, + LF, + CONTROL, + EXTEND, + ZWJ, + REGIONAL_INDICATOR, + PREPEND, + SPACING_MARK, + EXTENDED_PICTOGRAPHIC, + L, + V, + T, + LV, + LVT, + OTHER, +} +