From 6718bbdba15298a8b31b684337b58ccb3b3f11c2 Mon Sep 17 00:00:00 2001
From: Ayman Bagabas <ayman.bagabas@gmail.com>
Date: Thu, 17 Oct 2024 18:07:15 -0400
Subject: [PATCH] feat(ansi): add method type for cell width calculation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds a new type `Method` to the `ansi` package that represents how
to calculate the cell widths in the terminal. The default is to use
`GraphemeWidth` which keeps the current behavior. In order to use
`WcWidth`, you can call the member functions on the `Method` type.

```go
ansi.StringWidth("👩‍👩‍👧‍👧") // 2
ansi.GraphemeWidth.StringWidth("👩‍👩‍👧‍👧") // 2
ansi.WcWidth.StringWidth("👩‍👩‍👧‍👧") // 6
```
---
 ansi/go.mod           |   7 ++-
 ansi/go.sum           |   4 ++
 ansi/method.go        |  13 +++++
 ansi/parser_decode.go | 112 +++++++++++++++++++++++++++++++++++++++++-
 ansi/truncate.go      |  12 +++++
 ansi/width.go         |  16 +++++-
 ansi/width_test.go    |   2 +-
 ansi/wrap.go          |  50 ++++++++++++++++++-
 cellbuf/grid.go       |   9 ++--
 cellbuf/grid_write.go |  34 +++----------
 cellbuf/method.go     |  11 -----
 11 files changed, 221 insertions(+), 49 deletions(-)
 create mode 100644 ansi/method.go
 delete mode 100644 cellbuf/method.go

diff --git a/ansi/go.mod b/ansi/go.mod
index 2ef7be15..1ca0341b 100644
--- a/ansi/go.mod
+++ b/ansi/go.mod
@@ -2,4 +2,9 @@ module github.com/charmbracelet/x/ansi
 
 go 1.18
 
-require github.com/rivo/uniseg v0.4.7
+require (
+	github.com/charmbracelet/x/wcwidth v0.0.0-20241017213443-f2394f742aee
+	github.com/rivo/uniseg v0.4.7
+)
+
+require golang.org/x/text v0.19.0 // indirect
diff --git a/ansi/go.sum b/ansi/go.sum
index 9008848b..f6bf5448 100644
--- a/ansi/go.sum
+++ b/ansi/go.sum
@@ -1,2 +1,6 @@
+github.com/charmbracelet/x/wcwidth v0.0.0-20241017213443-f2394f742aee h1:pPwlYuFaYNPhFRqbwEqlRyq5PwLdLInR4zAlh9e7ad0=
+github.com/charmbracelet/x/wcwidth v0.0.0-20241017213443-f2394f742aee/go.mod h1:Ey8PFmYwH+/td9bpiEx07Fdx9ZVkxfIjWXxBluxF4Nw=
 github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
 github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
+golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM=
+golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
diff --git a/ansi/method.go b/ansi/method.go
new file mode 100644
index 00000000..1181e59e
--- /dev/null
+++ b/ansi/method.go
@@ -0,0 +1,13 @@
+package ansi
+
+// Method represents how cell widths are calculated in the terminal. The zero
+// value is [WcWidth], while package-level helpers such as [StringWidth] use
+// [GraphemeWidth]. Some terminals use grapheme clustering by default; others
+// support mode 2027 to switch from wcwidth to grapheme clustering.
+type Method uint8
+
+// Display width modes.
+const (
+	WcWidth Method = iota
+	GraphemeWidth
+)
diff --git a/ansi/parser_decode.go b/ansi/parser_decode.go
index 76688d0b..1437ddf1 100644
--- a/ansi/parser_decode.go
+++ b/ansi/parser_decode.go
@@ -6,6 +6,7 @@ import (
 	"unicode/utf8"
 
 	"github.com/charmbracelet/x/ansi/parser"
+	"github.com/charmbracelet/x/wcwidth"
 	"github.com/rivo/uniseg"
 )
 
@@ -67,7 +68,113 @@ const (
 //		state = newState
 //		input = input[n:]
 //	}
-func DecodeSequence[T string | []byte](b T, state byte, p *Parser) (seq T, width int, n int, newState byte) {
+func DecodeSequence(b []byte, state byte, p *Parser) (seq []byte, width int, n int, newState byte) {
+	return GraphemeWidth.DecodeSequence(b, state, p)
+}
+
+// DecodeSequenceInString is like [DecodeSequence] but for strings.
+func DecodeSequenceInString(s string, state byte, p *Parser) (seq string, width int, n int, newState byte) {
+	return GraphemeWidth.DecodeSequenceInString(s, state, p)
+}
+
+// DecodeSequence decodes the first ANSI escape sequence or a printable
+// grapheme from the given data. It returns the sequence slice, the cell width
+// of the sequence, the number of bytes read, and the new state.
+//
+// The cell width will always be 0 for control and escape sequences, 1 for
+// ASCII printable characters, and the number of cells other Unicode characters
+// occupy. Text is always split into grapheme clusters with the uniseg package;
+// the width of a cluster comes from the wcwidth package when m is [WcWidth]
+// and from uniseg (grapheme clustering, mode 2027) otherwise.
+//
+// Passing a non-nil [*Parser] as the last argument will allow the decoder to
+// collect sequence parameters, data, and commands. The parser cmd will have
+// the packed command value that contains intermediate and marker characters.
+// In the case of a OSC sequence, the cmd will be the OSC command number. Use
+// [Cmd] and [Param] types to unpack command intermediates and markers as well
+// as parameters.
+//
+// Zero [p.Cmd] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
+// validity of other data sequences, OSC, DCS, etc, will require checking for
+// the returned sequence terminator bytes such as ST (ESC \\) and BEL.
+//
+// We pack the command byte together with the marker and intermediate bytes
+// into [p.Cmd]. This is done to avoid using a struct to store the command and
+// its intermediates and markers. The command byte is always the least
+// significant byte i.e. [p.Cmd & 0xff]; the marker and intermediate bytes
+// occupy the higher bytes. Use the [Cmd] type to unpack the command,
+// intermediate, and marker bytes. Note that we only collect the last marker
+// character and intermediate byte.
+//
+// The [p.Params] slice will contain the parameters of the sequence. Any
+// sub-parameter will have the [parser.HasMoreFlag] set. Use the [Param] type
+// to unpack the parameters.
+//
+// Example:
+//
+//	var state byte // the initial state is always zero [NormalState]
+//	p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
+//	input := []byte("\x1b[31mHello, World!\x1b[0m")
+//	for len(input) > 0 {
+//		seq, width, n, newState := DecodeSequence(input, state, p)
+//		log.Printf("seq: %q, width: %d", seq, width)
+//		state = newState
+//		input = input[n:]
+//	}
+func (m Method) DecodeSequence(b []byte, state byte, p *Parser) (seq []byte, width int, n int, newState byte) {
+	return decodeSequence(m, b, state, p)
+}
+
+// DecodeSequenceInString is like [DecodeSequence] but for strings.
+func (m Method) DecodeSequenceInString(s string, state byte, p *Parser) (seq string, width int, n int, newState byte) {
+	return decodeSequence(m, s, state, p)
+}
+
+// decodeSequence decodes the first ANSI escape sequence or a printable
+// grapheme from the given data. It returns the sequence slice, the cell width
+// of the sequence, the number of bytes read, and the new state.
+//
+// The cell width will always be 0 for control and escape sequences, 1 for
+// ASCII printable characters, and the number of cells other Unicode characters
+// occupy. Text is always split into grapheme clusters with the uniseg package;
+// the width of a cluster comes from the wcwidth package when m is [WcWidth]
+// and from uniseg (grapheme clustering, mode 2027) otherwise.
+//
+// Passing a non-nil [*Parser] as the last argument will allow the decoder to
+// collect sequence parameters, data, and commands. The parser cmd will have
+// the packed command value that contains intermediate and marker characters.
+// In the case of a OSC sequence, the cmd will be the OSC command number. Use
+// [Cmd] and [Param] types to unpack command intermediates and markers as well
+// as parameters.
+//
+// Zero [p.Cmd] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
+// validity of other data sequences, OSC, DCS, etc, will require checking for
+// the returned sequence terminator bytes such as ST (ESC \\) and BEL.
+//
+// We pack the command byte together with the marker and intermediate bytes
+// into [p.Cmd]. This is done to avoid using a struct to store the command and
+// its intermediates and markers. The command byte is always the least
+// significant byte i.e. [p.Cmd & 0xff]; the marker and intermediate bytes
+// occupy the higher bytes. Use the [Cmd] type to unpack the command,
+// intermediate, and marker bytes. Note that we only collect the last marker
+// character and intermediate byte.
+//
+// The [p.Params] slice will contain the parameters of the sequence. Any
+// sub-parameter will have the [parser.HasMoreFlag] set. Use the [Param] type
+// to unpack the parameters.
+//
+// Example:
+//
+//	var state byte // the initial state is always zero [NormalState]
+//	p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
+//	input := []byte("\x1b[31mHello, World!\x1b[0m")
+//	for len(input) > 0 {
+//		seq, width, n, newState := DecodeSequence(input, state, p)
+//		log.Printf("seq: %q, width: %d", seq, width)
+//		state = newState
+//		input = input[n:]
+//	}
+func decodeSequence[T string | []byte](m Method, b T, state byte, p *Parser) (seq T, width int, n int, newState byte) {
 	for i := 0; i < len(b); i++ {
 		c := b[i]
 
@@ -122,6 +229,9 @@ func DecodeSequence[T string | []byte](b T, state byte, p *Parser) (seq T, width
 
 			if utf8.RuneStart(c) {
 				seq, _, width, _ = FirstGraphemeCluster(b, -1)
+				if m == WcWidth {
+					width = wcwidth.StringWidth(string(seq))
+				}
 				i += len(seq)
 				return b[:i], width, i, NormalState
 			}
diff --git a/ansi/truncate.go b/ansi/truncate.go
index db0782c8..f9849ba4 100644
--- a/ansi/truncate.go
+++ b/ansi/truncate.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 
 	"github.com/charmbracelet/x/ansi/parser"
+	"github.com/charmbracelet/x/wcwidth"
 	"github.com/rivo/uniseg"
 )
 
@@ -12,6 +13,14 @@ import (
 // This function is aware of ANSI escape codes and will not break them, and
 // accounts for wide-characters (such as East Asians and emojis).
 func Truncate(s string, length int, tail string) string {
+	return GraphemeWidth.Truncate(s, length, tail)
+}
+
+// Truncate truncates a string to a given length in cells, adding a tail to
+// the end if the string is wider than that. m measures s and its clusters.
+// This function is aware of ANSI escape codes and will not break them, and
+// accounts for wide-characters (such as East Asians and emojis).
+func (m Method) Truncate(s string, length int, tail string) string {
-	if sw := StringWidth(s); sw <= length {
+	if sw := m.StringWidth(s); sw <= length {
 		return s
 	}
@@ -41,6 +50,9 @@ func Truncate(s string, length int, tail string) string {
 			// This action happens when we transition to the Utf8State.
 			var width int
 			cluster, _, width, _ = uniseg.FirstGraphemeCluster(b[i:], -1)
+			if m == WcWidth {
+				width = wcwidth.StringWidth(string(cluster))
+			}
 
 			// increment the index by the length of the cluster
 			i += len(cluster)
diff --git a/ansi/width.go b/ansi/width.go
index 80890e42..f39ffe88 100644
--- a/ansi/width.go
+++ b/ansi/width.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 
 	"github.com/charmbracelet/x/ansi/parser"
+	"github.com/charmbracelet/x/wcwidth"
 	"github.com/rivo/uniseg"
 )
 
@@ -63,6 +64,14 @@ func Strip(s string) string {
 // codes are ignored and wide characters (such as East Asians and emojis) are
 // accounted for.
 func StringWidth(s string) int {
+	return GraphemeWidth.StringWidth(s)
+}
+
+// StringWidth returns the width of a string in cells. This is the number of
+// cells that the string will occupy when printed in a terminal. ANSI escape
+// codes are ignored and wide characters (such as East Asians and emojis) are
+// accounted for.
+func (m Method) StringWidth(s string) int {
 	if s == "" {
 		return 0
 	}
@@ -78,7 +87,12 @@ func StringWidth(s string) int {
 		if state == parser.Utf8State {
 			var w int
 			cluster, _, w, _ = uniseg.FirstGraphemeClusterInString(s[i:], -1)
-			width += w
+			switch m {
+			case WcWidth:
+				width += wcwidth.StringWidth(cluster)
+			case GraphemeWidth:
+				width += w
+			}
 			i += len(cluster) - 1
 			pstate = parser.GroundState
 			continue
diff --git a/ansi/width_test.go b/ansi/width_test.go
index 2c41bc7b..4822466f 100644
--- a/ansi/width_test.go
+++ b/ansi/width_test.go
@@ -47,7 +47,7 @@ func TestStrip(t *testing.T) {
 func TestStringWidth(t *testing.T) {
 	for i, c := range cases {
 		t.Run(c.name, func(t *testing.T) {
-			if width := StringWidth(c.input); width != c.width {
+			if width := GraphemeWidth.StringWidth(c.input); width != c.width {
 				t.Errorf("test case %d failed: expected %d, got %d", i+1, c.width, width)
 			}
 		})
diff --git a/ansi/wrap.go b/ansi/wrap.go
index d080a77a..7144bad4 100644
--- a/ansi/wrap.go
+++ b/ansi/wrap.go
@@ -6,6 +6,7 @@ import (
 	"unicode/utf8"
 
 	"github.com/charmbracelet/x/ansi/parser"
+	"github.com/charmbracelet/x/wcwidth"
 	"github.com/rivo/uniseg"
 )
 
@@ -18,6 +19,15 @@ const nbsp = 0xA0
 // When preserveSpace is true, spaces at the beginning of a line will be
 // preserved.
 func Hardwrap(s string, limit int, preserveSpace bool) string {
+	return GraphemeWidth.Hardwrap(s, limit, preserveSpace)
+}
+
+// Hardwrap wraps a string or a block of text to a given line length, breaking
+// word boundaries. This will preserve ANSI escape codes and will account for
+// wide-characters in the string.
+// When preserveSpace is true, spaces at the beginning of a line will be
+// preserved.
+func (m Method) Hardwrap(s string, limit int, preserveSpace bool) string {
 	if limit < 1 {
 		return s
 	}
@@ -56,7 +66,12 @@ func Hardwrap(s string, limit int, preserveSpace bool) string {
 			}
 
 			buf.Write(cluster)
-			curWidth += width
+			switch m {
+			case WcWidth:
+				curWidth += wcwidth.StringWidth(string(cluster))
+			case GraphemeWidth:
+				curWidth += width
+			}
 			pstate = parser.GroundState
 			continue
 		}
@@ -107,6 +122,18 @@ func Hardwrap(s string, limit int, preserveSpace bool) string {
 //
 // Note: breakpoints must be a string of 1-cell wide rune characters.
 func Wordwrap(s string, limit int, breakpoints string) string {
+	return GraphemeWidth.Wordwrap(s, limit, breakpoints)
+}
+
+// Wordwrap wraps a string or a block of text to a given line length, not
+// breaking word boundaries. This will preserve ANSI escape codes and will
+// account for wide-characters in the string.
+// The breakpoints string is a list of characters that are considered
+// breakpoints for word wrapping. A hyphen (-) is always considered a
+// breakpoint.
+//
+// Note: breakpoints must be a string of 1-cell wide rune characters.
+func (m Method) Wordwrap(s string, limit int, breakpoints string) string {
 	if limit < 1 {
 		return s
 	}
@@ -165,7 +192,12 @@ func Wordwrap(s string, limit int, breakpoints string) string {
 				curWidth++
 			} else {
 				word.Write(cluster)
-				wordLen += width
+				switch m {
+				case WcWidth:
+					wordLen += wcwidth.StringWidth(string(cluster))
+				case GraphemeWidth:
+					wordLen += width
+				}
 				if curWidth+space.Len()+wordLen > limit &&
 					wordLen < limit {
 					addNewline()
@@ -235,6 +267,17 @@ func Wordwrap(s string, limit int, breakpoints string) string {
 //
 // Note: breakpoints must be a string of 1-cell wide rune characters.
 func Wrap(s string, limit int, breakpoints string) string {
+	return GraphemeWidth.Wrap(s, limit, breakpoints)
+}
+
+// Wrap wraps a string or a block of text to a given line length, breaking word
+// boundaries if necessary. This will preserve ANSI escape codes and will
+// account for wide-characters in the string. The breakpoints string is a list
+// of characters that are considered breakpoints for word wrapping. A hyphen
+// (-) is always considered a breakpoint.
+//
+// Note: breakpoints must be a string of 1-cell wide rune characters.
+func (m Method) Wrap(s string, limit int, breakpoints string) string {
 	if limit < 1 {
 		return s
 	}
@@ -280,6 +323,9 @@ func Wrap(s string, limit int, breakpoints string) string {
 		if state == parser.Utf8State {
 			var width int
 			cluster, _, width, _ = uniseg.FirstGraphemeCluster(b[i:], -1)
+			if m == WcWidth {
+				width = wcwidth.StringWidth(string(cluster))
+			}
 			i += len(cluster)
 
 			r, _ := utf8.DecodeRune(cluster)
diff --git a/cellbuf/grid.go b/cellbuf/grid.go
index 8a5d212e..13de1114 100644
--- a/cellbuf/grid.go
+++ b/cellbuf/grid.go
@@ -3,7 +3,6 @@ package cellbuf
 import (
 	"bytes"
 	"strings"
-	"unicode/utf8"
 
 	"github.com/charmbracelet/x/ansi"
 )
@@ -34,13 +33,13 @@ type Grid interface {
 
 // SetContentAt writes the given data to the grid starting from the given
 // position and with the given width and height.
-func (m WidthMethod) SetContentAt(b Grid, c string, x, y, w, h int) []int {
-	return setContent(b, c, x, y, w, h, m, strings.ReplaceAll, utf8.DecodeRuneInString)
+func SetContentAt(m ansi.Method, b Grid, c string, x, y, w, h int) []int {
+	return setContent(b, c, x, y, w, h, m)
 }
 
 // SetContent writes the given data to the grid starting from the first cell.
-func (m WidthMethod) SetContent(g Grid, content string) []int {
-	return m.SetContentAt(g, content, 0, 0, g.Width(), Height(content))
+func SetContent(m ansi.Method, g Grid, content string) []int {
+	return SetContentAt(m, g, content, 0, 0, g.Width(), Height(content))
 }
 
 // Render returns a string representation of the grid with ANSI escape sequences.
diff --git a/cellbuf/grid_write.go b/cellbuf/grid_write.go
index eef10453..fd0e0a9b 100644
--- a/cellbuf/grid_write.go
+++ b/cellbuf/grid_write.go
@@ -2,26 +2,19 @@ package cellbuf
 
 import (
 	"bytes"
-	"unicode/utf8"
+	"strings"
 
 	"github.com/charmbracelet/x/ansi"
-	"github.com/charmbracelet/x/wcwidth"
 )
 
 // setContent writes the given data to the buffer starting from the first cell.
-// It accepts both string and []byte data types.
+// The data is given as a string.
-func setContent[
-	T string | []byte,
-	TReplaceAllFunc func(s T, old T, new T) T, //nolint:predeclared
-	TDecodeRuneFunc func(p T) (rune, int),
-](
+func setContent(
 	buf Grid,
-	data T,
+	data string,
 	x, y int,
 	w, h int,
-	method WidthMethod,
-	replaceAll TReplaceAllFunc,
-	decodeRune TDecodeRuneFunc,
+	method ansi.Method,
 ) []int {
 	var cell Cell
 	var pen Style
@@ -30,7 +23,7 @@ func setContent[
 
 	p := ansi.GetParser()
 	defer ansi.PutParser(p)
-	data = replaceAll(data, T("\r\n"), T("\n"))
+	data = strings.ReplaceAll(data, "\r\n", "\n")
 
 	// linew is a slice of line widths. We use this to keep track of the
 	// written widths of each line. We use this information later to optimize
@@ -41,23 +34,10 @@ func setContent[
 
 	var state byte
 	for len(data) > 0 {
-		seq, width, n, newState := ansi.DecodeSequence(data, state, p)
+		seq, width, n, newState := method.DecodeSequenceInString(data, state, p)
 
 		switch width {
 		case 2, 3, 4: // wide cells can go up to 4 cells wide
-
-			switch method {
-			case WcWidth:
-				if r, rw := decodeRune(data); r != utf8.RuneError {
-					n = rw
-					width = wcwidth.RuneWidth(r)
-					seq = T(string(r))
-					newState = 0
-				}
-			case GraphemeWidth:
-				// [ansi.DecodeSequence] already handles grapheme clusters
-			}
-
 			// Mark wide cells with emptyCell zero width
 			// We set the wide cell down below
 			for j := 1; j < width; j++ {
@@ -227,7 +207,7 @@ func setContent[
 					link.URLID = id
 					link.URL = string(params[2])
 				}
-			case ansi.Equal(seq, T("\n")):
+			case ansi.Equal(seq, "\n"):
 				// Reset the rest of the line
 				for x < w {
 					buf.Set(x, y, spaceCell) //nolint:errcheck
diff --git a/cellbuf/method.go b/cellbuf/method.go
deleted file mode 100644
index 8bc7de2d..00000000
--- a/cellbuf/method.go
+++ /dev/null
@@ -1,11 +0,0 @@
-package cellbuf
-
-// WidthMethod is a type that represents the how the renderer should calculate
-// the display width of cells.
-type WidthMethod uint8
-
-// Display width modes.
-const (
-	WcWidth WidthMethod = iota
-	GraphemeWidth
-)