feat(ansi): add method type for cell width calculation

This adds a new type `Method` to the `ansi` package that represents how to calculate the cell widths in the terminal. The package-level functions default to `GraphemeWidth`, which keeps the current behavior. To use `WcWidth` instead, call the corresponding method on the `Method` value.

```go
ansi.StringWidth("👩‍👩‍👧‍👧") // 2
ansi.GraphemeWidth.StringWidth("👩‍👩‍👧‍👧") // 2
ansi.WcWidth.StringWidth("👩‍👩‍👧‍👧") // 6
```
charmbracelet · Oct 17, 2024 · 6718bbd · 6718bbd
1 parent 0e84032
commit 6718bbd
Show file tree

Hide file tree

Showing 11 changed files with 221 additions and 49 deletions.
diff --git a/ansi/go.mod b/ansi/go.mod
@@ -2,4 +2,9 @@ module github.com/charmbracelet/x/ansi
 
 go 1.18
 
-require github.com/rivo/uniseg v0.4.7
+require (
+	github.com/charmbracelet/x/wcwidth v0.0.0-20241017213443-f2394f742aee
+	github.com/rivo/uniseg v0.4.7
+)
+
+require golang.org/x/text v0.19.0 // indirect
diff --git a/ansi/go.sum b/ansi/go.sum
@@ -1,2 +1,6 @@
+github.com/charmbracelet/x/wcwidth v0.0.0-20241017213443-f2394f742aee h1:pPwlYuFaYNPhFRqbwEqlRyq5PwLdLInR4zAlh9e7ad0=
+github.com/charmbracelet/x/wcwidth v0.0.0-20241017213443-f2394f742aee/go.mod h1:Ey8PFmYwH+/td9bpiEx07Fdx9ZVkxfIjWXxBluxF4Nw=
 github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
 github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
+golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM=
+golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
diff --git a/ansi/method.go b/ansi/method.go
@@ -0,0 +1,13 @@
+package ansi
+
+// Method represents how to calculate cell widths in the terminal. The
+// package-level functions use [GraphemeWidth]; the zero value is [WcWidth].
+// Some terminals use grapheme clustering by default. Others support mode 2027
+// to make the terminal use grapheme clustering instead of wcwidth.
+type Method uint8
+
+// Display width modes.
+const (
+	WcWidth Method = iota
+	GraphemeWidth
+)
diff --git a/ansi/parser_decode.go b/ansi/parser_decode.go
@@ -6,6 +6,7 @@ import (
 	"unicode/utf8"
 
 	"github.com/charmbracelet/x/ansi/parser"
+	"github.com/charmbracelet/x/wcwidth"
 	"github.com/rivo/uniseg"
 )
 
@@ -67,7 +68,113 @@ const (
 //		state = newState
 //		input = input[n:]
 //	}
-func DecodeSequence[T string | []byte](b T, state byte, p *Parser) (seq T, width int, n int, newState byte) {
+func DecodeSequence(b []byte, state byte, p *Parser) (seq []byte, width int, n int, newState byte) {
+	return GraphemeWidth.DecodeSequence(b, state, p)
+}
+
+// DecodeSequenceInString is like [DecodeSequence] but for strings.
+func DecodeSequenceInString(s string, state byte, p *Parser) (seq string, width int, n int, newState byte) {
+	return GraphemeWidth.DecodeSequenceInString(s, state, p)
+}
+
+// DecodeSequence decodes the first ANSI escape sequence or a printable
+// grapheme from the given data. It returns the sequence slice, the cell width
+// of the sequence, the number of bytes read, and the new state.
+//
+// The cell width will always be 0 for control and escape sequences, 1 for
+// ASCII printable characters, and the number of cells other Unicode characters
+// occupy. Input is always split into grapheme clusters with the uniseg
+// package; with [GraphemeWidth] the cluster width reported by uniseg is used
+// (mode 2027), while with [WcWidth] each cluster is measured with wcwidth.
+//
+// Passing a non-nil [*Parser] as the last argument will allow the decoder to
+// collect sequence parameters, data, and commands. The parser cmd will have
+// the packed command value that contains intermediate and marker characters.
+// In the case of a OSC sequence, the cmd will be the OSC command number. Use
+// [Cmd] and [Param] types to unpack command intermediates and markers as well
+// as parameters.
+//
+// Zero [p.Cmd] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
+// validity of other data sequences, OSC, DCS, etc, will require checking for
+// the returned sequence terminator bytes such as ST (ESC \\) and BEL.
+//
+// We store the command byte in [p.Cmd] in the least significant byte, the
+// marker byte in the next byte, and the intermediate byte in the byte after
+// that. This is done to avoid using a struct to store the command and its
+// intermediates and markers. The command byte can always be read with
+// [p.Cmd & 0xff]. Use the [Cmd] type to unpack the command, intermediate, and
+// marker bytes. Note that we only collect the last marker character and
+// intermediate byte.
+//
+// The [p.Params] slice will contain the parameters of the sequence. Any
+// sub-parameter will have the [parser.HasMoreFlag] set. Use the [Param] type
+// to unpack the parameters.
+//
+// Example:
+//
+//	var state byte // the initial state is always zero [NormalState]
+//	p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
+//	input := []byte("\x1b[31mHello, World!\x1b[0m")
+//	for len(input) > 0 {
+//		seq, width, n, newState := DecodeSequence(input, state, p)
+//		log.Printf("seq: %q, width: %d", seq, width)
+//		state = newState
+//		input = input[n:]
+//	}
+func (m Method) DecodeSequence(b []byte, state byte, p *Parser) (seq []byte, width int, n int, newState byte) {
+	return decodeSequence(m, b, state, p)
+}
+
+// DecodeSequenceInString is like [DecodeSequence] but for strings.
+func (m Method) DecodeSequenceInString(s string, state byte, p *Parser) (seq string, width int, n int, newState byte) {
+	return decodeSequence(m, s, state, p)
+}
+
+// decodeSequence decodes the first ANSI escape sequence or a printable
+// grapheme from the given data. It returns the sequence slice, the cell width
+// of the sequence, the number of bytes read, and the new state.
+//
+// The cell width will always be 0 for control and escape sequences, 1 for
+// ASCII printable characters, and the number of cells other Unicode characters
+// occupy. Input is always split into grapheme clusters with the uniseg
+// package; when m is [GraphemeWidth] the cluster width reported by uniseg is
+// used (mode 2027), and when m is [WcWidth] the cluster is measured with wcwidth.
+//
+// Passing a non-nil [*Parser] as the last argument will allow the decoder to
+// collect sequence parameters, data, and commands. The parser cmd will have
+// the packed command value that contains intermediate and marker characters.
+// In the case of a OSC sequence, the cmd will be the OSC command number. Use
+// [Cmd] and [Param] types to unpack command intermediates and markers as well
+// as parameters.
+//
+// Zero [p.Cmd] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the
+// validity of other data sequences, OSC, DCS, etc, will require checking for
+// the returned sequence terminator bytes such as ST (ESC \\) and BEL.
+//
+// We store the command byte in [p.Cmd] in the least significant byte, the
+// marker byte in the next byte, and the intermediate byte in the byte after
+// that. This is done to avoid using a struct to store the command and its
+// intermediates and markers. The command byte can always be read with
+// [p.Cmd & 0xff]. Use the [Cmd] type to unpack the command, intermediate, and
+// marker bytes. Note that we only collect the last marker character and
+// intermediate byte.
+//
+// The [p.Params] slice will contain the parameters of the sequence. Any
+// sub-parameter will have the [parser.HasMoreFlag] set. Use the [Param] type
+// to unpack the parameters.
+//
+// Example:
+//
+//	var state byte // the initial state is always zero [NormalState]
+//	p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional)
+//	input := []byte("\x1b[31mHello, World!\x1b[0m")
+//	for len(input) > 0 {
+//		seq, width, n, newState := DecodeSequence(input, state, p)
+//		log.Printf("seq: %q, width: %d", seq, width)
+//		state = newState
+//		input = input[n:]
+//	}
+func decodeSequence[T string | []byte](m Method, b T, state byte, p *Parser) (seq T, width int, n int, newState byte) {
 	for i := 0; i < len(b); i++ {
 		c := b[i]
 
@@ -122,6 +229,9 @@ func DecodeSequence[T string | []byte](b T, state byte, p *Parser) (seq T, width
 
 			if utf8.RuneStart(c) {
 				seq, _, width, _ = FirstGraphemeCluster(b, -1)
+				if m == WcWidth {
+					width = wcwidth.StringWidth(string(seq))
+				}
 				i += len(seq)
 				return b[:i], width, i, NormalState
 			}

diff --git a/ansi/truncate.go b/ansi/truncate.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 
 	"github.com/charmbracelet/x/ansi/parser"
+	"github.com/charmbracelet/x/wcwidth"
 	"github.com/rivo/uniseg"
 )
 
@@ -12,6 +13,14 @@ import (
 // This function is aware of ANSI escape codes and will not break them, and
 // accounts for wide-characters (such as East Asians and emojis).
 func Truncate(s string, length int, tail string) string {
+	return GraphemeWidth.Truncate(s, length, tail)
+}
+
+// Truncate truncates s to length cells using m to measure grapheme clusters,
+// appending tail if s is longer. ANSI escape codes are never broken and wide
+// characters (such as East Asian characters and emojis) are accounted for.
+// NOTE(review): the early-return check calls StringWidth, not m.StringWidth.
+func (m Method) Truncate(s string, length int, tail string) string {
 	if sw := StringWidth(s); sw <= length {
 		return s
 	}
@@ -41,6 +50,9 @@ func Truncate(s string, length int, tail string) string {
 			// This action happens when we transition to the Utf8State.
 			var width int
 			cluster, _, width, _ = uniseg.FirstGraphemeCluster(b[i:], -1)
+			if m == WcWidth {
+				width = wcwidth.StringWidth(string(cluster))
+			}
 
 			// increment the index by the length of the cluster
 			i += len(cluster)

diff --git a/ansi/width.go b/ansi/width.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 
 	"github.com/charmbracelet/x/ansi/parser"
+	"github.com/charmbracelet/x/wcwidth"
 	"github.com/rivo/uniseg"
 )
 
@@ -63,6 +64,14 @@ func Strip(s string) string {
 // codes are ignored and wide characters (such as East Asians and emojis) are
 // accounted for.
 func StringWidth(s string) int {
+	return GraphemeWidth.StringWidth(s)
+}
+
+// StringWidth returns the width of a string in cells, measured with m. This
+// is the number of cells that the string will occupy when printed in a
+// terminal. ANSI escape codes are ignored and wide characters (such as East
+// Asian characters and emojis) are accounted for.
+func (m Method) StringWidth(s string) int {
 	if s == "" {
 		return 0
 	}
@@ -78,7 +87,12 @@ func StringWidth(s string) int {
 		if state == parser.Utf8State {
 			var w int
 			cluster, _, w, _ = uniseg.FirstGraphemeClusterInString(s[i:], -1)
-			width += w
+			switch m {
+			case WcWidth:
+				width += wcwidth.StringWidth(cluster)
+			case GraphemeWidth:
+				width += w
+			}
 			i += len(cluster) - 1
 			pstate = parser.GroundState
 			continue

diff --git a/ansi/width_test.go b/ansi/width_test.go
@@ -47,7 +47,7 @@ func TestStrip(t *testing.T) {
 func TestStringWidth(t *testing.T) {
 	for i, c := range cases {
 		t.Run(c.name, func(t *testing.T) {
-			if width := StringWidth(c.input); width != c.width {
+			if width := GraphemeWidth.StringWidth(c.input); width != c.width {
 				t.Errorf("test case %d failed: expected %d, got %d", i+1, c.width, width)
 			}
 		})

diff --git a/ansi/wrap.go b/ansi/wrap.go
@@ -6,6 +6,7 @@ import (
 	"unicode/utf8"
 
 	"github.com/charmbracelet/x/ansi/parser"
+	"github.com/charmbracelet/x/wcwidth"
 	"github.com/rivo/uniseg"
 )
 
@@ -18,6 +19,15 @@ const nbsp = 0xA0
 // When preserveSpace is true, spaces at the beginning of a line will be
 // preserved.
 func Hardwrap(s string, limit int, preserveSpace bool) string {
+	return GraphemeWidth.Hardwrap(s, limit, preserveSpace)
+}
+
+// Hardwrap wraps a string or a block of text to a given line length, breaking
+// word boundaries. This will preserve ANSI escape codes and will account for
+// wide-characters in the string.
+// When preserveSpace is true, spaces at the beginning of a line will be
+// preserved.
+func (m Method) Hardwrap(s string, limit int, preserveSpace bool) string {
 	if limit < 1 {
 		return s
 	}
@@ -56,7 +66,12 @@ func Hardwrap(s string, limit int, preserveSpace bool) string {
 			}
 
 			buf.Write(cluster)
-			curWidth += width
+			switch m {
+			case WcWidth:
+				curWidth += wcwidth.StringWidth(string(cluster))
+			case GraphemeWidth:
+				curWidth += width
+			}
 			pstate = parser.GroundState
 			continue
 		}
@@ -107,6 +122,18 @@ func Hardwrap(s string, limit int, preserveSpace bool) string {
 //
 // Note: breakpoints must be a string of 1-cell wide rune characters.
 func Wordwrap(s string, limit int, breakpoints string) string {
+	return GraphemeWidth.Wordwrap(s, limit, breakpoints)
+}
+
+// Wordwrap wraps a string or a block of text to a given line length, not
+// breaking word boundaries. This will preserve ANSI escape codes and will
+// account for wide-characters in the string.
+// The breakpoints string is a list of characters that are considered
+// breakpoints for word wrapping. A hyphen (-) is always considered a
+// breakpoint.
+//
+// Note: breakpoints must be a string of 1-cell wide rune characters.
+func (m Method) Wordwrap(s string, limit int, breakpoints string) string {
 	if limit < 1 {
 		return s
 	}
@@ -165,7 +192,12 @@ func Wordwrap(s string, limit int, breakpoints string) string {
 				curWidth++
 			} else {
 				word.Write(cluster)
-				wordLen += width
+				switch m {
+				case WcWidth:
+					wordLen += wcwidth.StringWidth(string(cluster))
+				case GraphemeWidth:
+					wordLen += width
+				}
 				if curWidth+space.Len()+wordLen > limit &&
 					wordLen < limit {
 					addNewline()
@@ -235,6 +267,17 @@ func Wordwrap(s string, limit int, breakpoints string) string {
 //
 // Note: breakpoints must be a string of 1-cell wide rune characters.
 func Wrap(s string, limit int, breakpoints string) string {
+	return GraphemeWidth.Wrap(s, limit, breakpoints)
+}
+
+// Wrap wraps a string or a block of text to a given line length, breaking word
+// boundaries if necessary. This will preserve ANSI escape codes and will
+// account for wide-characters in the string. The breakpoints string is a list
+// of characters that are considered breakpoints for word wrapping. A hyphen
+// (-) is always considered a breakpoint.
+//
+// Note: breakpoints must be a string of 1-cell wide rune characters.
+func (m Method) Wrap(s string, limit int, breakpoints string) string {
 	if limit < 1 {
 		return s
 	}
@@ -280,6 +323,9 @@ func Wrap(s string, limit int, breakpoints string) string {
 		if state == parser.Utf8State {
 			var width int
 			cluster, _, width, _ = uniseg.FirstGraphemeCluster(b[i:], -1)
+			if m == WcWidth {
+				width = wcwidth.StringWidth(string(cluster))
+			}
 			i += len(cluster)
 
 			r, _ := utf8.DecodeRune(cluster)

diff --git a/cellbuf/grid.go b/cellbuf/grid.go
@@ -3,7 +3,6 @@ package cellbuf
 import (
 	"bytes"
 	"strings"
-	"unicode/utf8"
 
 	"github.com/charmbracelet/x/ansi"
 )
@@ -34,13 +33,13 @@ type Grid interface {
 
 // SetContentAt writes the given data to the grid starting from the given
 // position and with the given width and height.
-func (m WidthMethod) SetContentAt(b Grid, c string, x, y, w, h int) []int {
-	return setContent(b, c, x, y, w, h, m, strings.ReplaceAll, utf8.DecodeRuneInString)
+func SetContentAt(m ansi.Method, b Grid, c string, x, y, w, h int) []int {
+	return setContent(b, c, x, y, w, h, m)
 }
 
 // SetContent writes the given data to the grid starting from the first cell.
-func (m WidthMethod) SetContent(g Grid, content string) []int {
-	return m.SetContentAt(g, content, 0, 0, g.Width(), Height(content))
+func SetContent(m ansi.Method, g Grid, content string) []int {
+	return SetContentAt(m, g, content, 0, 0, g.Width(), Height(content))
 }
 
 // Render returns a string representation of the grid with ANSI escape sequences.