From 6718bbdba15298a8b31b684337b58ccb3b3f11c2 Mon Sep 17 00:00:00 2001 From: Ayman Bagabas Date: Thu, 17 Oct 2024 18:07:15 -0400 Subject: [PATCH] feat(ansi): add method type for cell width calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a new type `Method` to the `ansi` package that represents how to calculate the cell widths in the terminal. The default is to use `GraphemeWidth` which keeps the current behavior. In order to use `WcWidth`, you can call the member functions on the `Method` type. ```go ansi.StringWidth("👩‍👩‍👧‍👧") // 2 ansi.GraphemeWidth.StringWidth("👩‍👩‍👧‍👧") // 2 ansi.WcWidth.StringWidth("👩‍👩‍👧‍👧") // 6 ``` --- ansi/go.mod | 7 ++- ansi/go.sum | 4 ++ ansi/method.go | 13 +++++ ansi/parser_decode.go | 112 +++++++++++++++++++++++++++++++++++++++++- ansi/truncate.go | 12 +++++ ansi/width.go | 16 +++++- ansi/width_test.go | 2 +- ansi/wrap.go | 50 ++++++++++++++++++++++++++++++++++++++++++++++++- cellbuf/grid.go | 9 ++-- cellbuf/grid_write.go | 34 +++---------- cellbuf/method.go | 11 ----- 11 files changed, 221 insertions(+), 49 deletions(-) create mode 100644 ansi/method.go delete mode 100644 cellbuf/method.go diff --git a/ansi/go.mod b/ansi/go.mod index 2ef7be15..1ca0341b 100644 --- a/ansi/go.mod +++ b/ansi/go.mod @@ -2,4 +2,9 @@ module github.com/charmbracelet/x/ansi go 1.18 -require github.com/rivo/uniseg v0.4.7 +require ( + github.com/charmbracelet/x/wcwidth v0.0.0-20241017213443-f2394f742aee + github.com/rivo/uniseg v0.4.7 +) + +require golang.org/x/text v0.19.0 // indirect diff --git a/ansi/go.sum b/ansi/go.sum index 9008848b..f6bf5448 100644 --- a/ansi/go.sum +++ b/ansi/go.sum @@ -1,2 +1,6 @@ +github.com/charmbracelet/x/wcwidth v0.0.0-20241017213443-f2394f742aee h1:pPwlYuFaYNPhFRqbwEqlRyq5PwLdLInR4zAlh9e7ad0= +github.com/charmbracelet/x/wcwidth v0.0.0-20241017213443-f2394f742aee/go.mod h1:Ey8PFmYwH+/td9bpiEx07Fdx9ZVkxfIjWXxBluxF4Nw= github.com/rivo/uniseg v0.4.7 
h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= +golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= diff --git a/ansi/method.go b/ansi/method.go new file mode 100644 index 00000000..1181e59e --- /dev/null +++ b/ansi/method.go @@ -0,0 +1,13 @@ +package ansi + +// Method is a type that represents how to calculate the cell widths in the +// terminal. The zero value is [WcWidth]; the package-level functions use +// [GraphemeWidth]. Some terminals use grapheme clustering by default. Some +// support mode 2027 to switch from wcwidth to grapheme clustering. +type Method uint8 + +// Display width modes. +const ( + WcWidth Method = iota + GraphemeWidth +) diff --git a/ansi/parser_decode.go b/ansi/parser_decode.go index 76688d0b..1437ddf1 100644 --- a/ansi/parser_decode.go +++ b/ansi/parser_decode.go @@ -6,6 +6,7 @@ import ( "unicode/utf8" "github.com/charmbracelet/x/ansi/parser" + "github.com/charmbracelet/x/wcwidth" "github.com/rivo/uniseg" ) @@ -67,7 +68,113 @@ const ( // state = newState // input = input[n:] // } -func DecodeSequence[T string | []byte](b T, state byte, p *Parser) (seq T, width int, n int, newState byte) { +func DecodeSequence(b []byte, state byte, p *Parser) (seq []byte, width int, n int, newState byte) { + return GraphemeWidth.DecodeSequence(b, state, p) +} + +// DecodeSequenceInString is like [DecodeSequence] but for strings. +func DecodeSequenceInString(s string, state byte, p *Parser) (seq string, width int, n int, newState byte) { + return GraphemeWidth.DecodeSequenceInString(s, state, p) +} + +// DecodeSequence decodes the first ANSI escape sequence or a printable +// grapheme from the given data. It returns the sequence slice, the cell +// width of the sequence, the number of bytes read, and the new state. 
+// +// The cell width will always be 0 for control and escape sequences, 1 for +// ASCII printable characters, and the number of cells other Unicode characters +// occupy. It uses the uniseg package to find grapheme clusters; [GraphemeWidth] +// uses the uniseg width (grapheme clustering, mode 2027), while [WcWidth] +// measures each cluster with the wcwidth package. +// +// Passing a non-nil [*Parser] as the last argument will allow the decoder to +// collect sequence parameters, data, and commands. The parser cmd will have +// the packed command value that contains intermediate and marker characters. +// In the case of an OSC sequence, the cmd will be the OSC command number. Use +// [Cmd] and [Param] types to unpack command intermediates and markers as well +// as parameters. +// +// Zero [p.Cmd] means the CSI, DCS, or ESC sequence is invalid. Moreover, checking the +// validity of other data sequences, OSC, DCS, etc, will require checking for +// the returned sequence terminator bytes such as ST (ESC \\) and BEL. +// +// We store the command byte in [p.Cmd] in the least significant byte, the +// marker byte in the next byte, and the intermediate byte in the byte above +// that. This is done to avoid using a struct to store the command +// and its intermediates and markers. The command byte is always the least +// significant byte i.e. [p.Cmd & 0xff]. Use the [Cmd] type to unpack the +// command, intermediate, and marker bytes. Note that we only collect the last +// marker character and intermediate byte. +// +// The [p.Params] slice will contain the parameters of the sequence. Any +// sub-parameter will have the [parser.HasMoreFlag] set. Use the [Param] type +// to unpack the parameters. 
+// +// Example: +// +// var state byte // the initial state is always zero [NormalState] +// p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional) +// input := []byte("\x1b[31mHello, World!\x1b[0m") +// for len(input) > 0 { +// seq, width, n, newState := DecodeSequence(input, state, p) +// log.Printf("seq: %q, width: %d", seq, width) +// state = newState +// input = input[n:] +// } +func (m Method) DecodeSequence(b []byte, state byte, p *Parser) (seq []byte, width int, n int, newState byte) { + return decodeSequence(m, b, state, p) +} + +// DecodeSequenceInString is like [DecodeSequence] but for strings. +func (m Method) DecodeSequenceInString(s string, state byte, p *Parser) (seq string, width int, n int, newState byte) { + return decodeSequence(m, s, state, p) +} + +// decodeSequence decodes the first ANSI escape sequence or a printable +// grapheme from the given data. It returns the sequence slice, the number of +// bytes read, the cell width for each sequence, and the new state. +// +// The cell width will always be 0 for control and escape sequences, 1 for +// ASCII printable characters, and the number of cells other Unicode characters +// occupy. It uses the uniseg package to calculate the width of Unicode +// graphemes and characters. This means it will always do grapheme clustering +// (mode 2027). +// +// Passing a non-nil [*Parser] as the last argument will allow the decoder to +// collect sequence parameters, data, and commands. The parser cmd will have +// the packed command value that contains intermediate and marker characters. +// In the case of a OSC sequence, the cmd will be the OSC command number. Use +// [Cmd] and [Param] types to unpack command intermediates and markers as well +// as parameters. +// +// Zero [p.Cmd] means the CSI, DCS, or ESC sequence is invalid. 
Moreover, checking the +// validity of other data sequences, OSC, DCS, etc, will require checking for +// the returned sequence terminator bytes such as ST (ESC \\) and BEL). +// +// We store the command byte in [p.Cmd] in the most significant byte, the +// marker byte in the next byte, and the intermediate byte in the least +// significant byte. This is done to avoid using a struct to store the command +// and its intermediates and markers. The command byte is always the least +// significant byte i.e. [p.Cmd & 0xff]. Use the [Cmd] type to unpack the +// command, intermediate, and marker bytes. Note that we only collect the last +// marker character and intermediate byte. +// +// The [p.Params] slice will contain the parameters of the sequence. Any +// sub-parameter will have the [parser.HasMoreFlag] set. Use the [Param] type +// to unpack the parameters. +// +// Example: +// +// var state byte // the initial state is always zero [NormalState] +// p := NewParser(32, 1024) // create a new parser with a 32 params buffer and 1024 data buffer (optional) +// input := []byte("\x1b[31mHello, World!\x1b[0m") +// for len(input) > 0 { +// seq, width, n, newState := DecodeSequence(input, state, p) +// log.Printf("seq: %q, width: %d", seq, width) +// state = newState +// input = input[n:] +// } +func decodeSequence[T string | []byte](m Method, b T, state byte, p *Parser) (seq T, width int, n int, newState byte) { for i := 0; i < len(b); i++ { c := b[i] @@ -122,6 +229,9 @@ func DecodeSequence[T string | []byte](b T, state byte, p *Parser) (seq T, width if utf8.RuneStart(c) { seq, _, width, _ = FirstGraphemeCluster(b, -1) + if m == WcWidth { + width = wcwidth.StringWidth(string(seq)) + } i += len(seq) return b[:i], width, i, NormalState } diff --git a/ansi/truncate.go b/ansi/truncate.go index db0782c8..f9849ba4 100644 --- a/ansi/truncate.go +++ b/ansi/truncate.go @@ -4,6 +4,7 @@ import ( "bytes" "github.com/charmbracelet/x/ansi/parser" + "github.com/charmbracelet/x/wcwidth" 
"github.com/rivo/uniseg" ) @@ -12,6 +13,14 @@ import ( // This function is aware of ANSI escape codes and will not break them, and // accounts for wide-characters (such as East Asians and emojis). func Truncate(s string, length int, tail string) string { + return GraphemeWidth.Truncate(s, length, tail) +} + +// Truncate truncates a string to a given length, adding a tail to the +// end if the string is longer than the given length. +// This function is aware of ANSI escape codes and will not break them, and +// accounts for wide-characters (such as East Asians and emojis). +func (m Method) Truncate(s string, length int, tail string) string { if sw := StringWidth(s); sw <= length { return s } @@ -41,6 +50,9 @@ func Truncate(s string, length int, tail string) string { // This action happens when we transition to the Utf8State. var width int cluster, _, width, _ = uniseg.FirstGraphemeCluster(b[i:], -1) + if m == WcWidth { + width = wcwidth.StringWidth(string(cluster)) + } // increment the index by the length of the cluster i += len(cluster) diff --git a/ansi/width.go b/ansi/width.go index 80890e42..f39ffe88 100644 --- a/ansi/width.go +++ b/ansi/width.go @@ -4,6 +4,7 @@ import ( "bytes" "github.com/charmbracelet/x/ansi/parser" + "github.com/charmbracelet/x/wcwidth" "github.com/rivo/uniseg" ) @@ -63,6 +64,14 @@ func Strip(s string) string { // codes are ignored and wide characters (such as East Asians and emojis) are // accounted for. func StringWidth(s string) int { + return GraphemeWidth.StringWidth(s) +} + +// StringWidth returns the width of a string in cells. This is the number of +// cells that the string will occupy when printed in a terminal. ANSI escape +// codes are ignored and wide characters (such as East Asians and emojis) are +// accounted for. 
+func (m Method) StringWidth(s string) int { if s == "" { return 0 } @@ -78,7 +87,12 @@ func StringWidth(s string) int { if state == parser.Utf8State { var w int cluster, _, w, _ = uniseg.FirstGraphemeClusterInString(s[i:], -1) - width += w + switch m { + case WcWidth: + width += wcwidth.StringWidth(cluster) + case GraphemeWidth: + width += w + } i += len(cluster) - 1 pstate = parser.GroundState continue diff --git a/ansi/width_test.go b/ansi/width_test.go index 2c41bc7b..4822466f 100644 --- a/ansi/width_test.go +++ b/ansi/width_test.go @@ -47,7 +47,7 @@ func TestStrip(t *testing.T) { func TestStringWidth(t *testing.T) { for i, c := range cases { t.Run(c.name, func(t *testing.T) { - if width := StringWidth(c.input); width != c.width { + if width := GraphemeWidth.StringWidth(c.input); width != c.width { t.Errorf("test case %d failed: expected %d, got %d", i+1, c.width, width) } }) diff --git a/ansi/wrap.go b/ansi/wrap.go index d080a77a..7144bad4 100644 --- a/ansi/wrap.go +++ b/ansi/wrap.go @@ -6,6 +6,7 @@ import ( "unicode/utf8" "github.com/charmbracelet/x/ansi/parser" + "github.com/charmbracelet/x/wcwidth" "github.com/rivo/uniseg" ) @@ -18,6 +19,15 @@ const nbsp = 0xA0 // When preserveSpace is true, spaces at the beginning of a line will be // preserved. func Hardwrap(s string, limit int, preserveSpace bool) string { + return GraphemeWidth.Hardwrap(s, limit, preserveSpace) +} + +// Hardwrap wraps a string or a block of text to a given line length, breaking +// word boundaries. This will preserve ANSI escape codes and will account for +// wide-characters in the string. +// When preserveSpace is true, spaces at the beginning of a line will be +// preserved. 
+func (m Method) Hardwrap(s string, limit int, preserveSpace bool) string { if limit < 1 { return s } @@ -56,7 +66,12 @@ func Hardwrap(s string, limit int, preserveSpace bool) string { } buf.Write(cluster) - curWidth += width + switch m { + case WcWidth: + curWidth += wcwidth.StringWidth(string(cluster)) + case GraphemeWidth: + curWidth += width + } pstate = parser.GroundState continue } @@ -107,6 +122,18 @@ func Hardwrap(s string, limit int, preserveSpace bool) string { // // Note: breakpoints must be a string of 1-cell wide rune characters. func Wordwrap(s string, limit int, breakpoints string) string { + return GraphemeWidth.Wordwrap(s, limit, breakpoints) +} + +// Wordwrap wraps a string or a block of text to a given line length, not +// breaking word boundaries. This will preserve ANSI escape codes and will +// account for wide-characters in the string. +// The breakpoints string is a list of characters that are considered +// breakpoints for word wrapping. A hyphen (-) is always considered a +// breakpoint. +// +// Note: breakpoints must be a string of 1-cell wide rune characters. +func (m Method) Wordwrap(s string, limit int, breakpoints string) string { if limit < 1 { return s } @@ -165,7 +192,12 @@ func Wordwrap(s string, limit int, breakpoints string) string { curWidth++ } else { word.Write(cluster) - wordLen += width + switch m { + case WcWidth: + wordLen += wcwidth.StringWidth(string(cluster)) + case GraphemeWidth: + wordLen += width + } if curWidth+space.Len()+wordLen > limit && wordLen < limit { addNewline() @@ -235,6 +267,17 @@ func Wordwrap(s string, limit int, breakpoints string) string { // // Note: breakpoints must be a string of 1-cell wide rune characters. func Wrap(s string, limit int, breakpoints string) string { + return GraphemeWidth.Wrap(s, limit, breakpoints) +} + +// Wrap wraps a string or a block of text to a given line length, breaking word +// boundaries if necessary. 
This will preserve ANSI escape codes and will +// account for wide-characters in the string. The breakpoints string is a list +// of characters that are considered breakpoints for word wrapping. A hyphen +// (-) is always considered a breakpoint. +// +// Note: breakpoints must be a string of 1-cell wide rune characters. +func (m Method) Wrap(s string, limit int, breakpoints string) string { if limit < 1 { return s } @@ -280,6 +323,9 @@ func Wrap(s string, limit int, breakpoints string) string { if state == parser.Utf8State { var width int cluster, _, width, _ = uniseg.FirstGraphemeCluster(b[i:], -1) + if m == WcWidth { + width = wcwidth.StringWidth(string(cluster)) + } i += len(cluster) r, _ := utf8.DecodeRune(cluster) diff --git a/cellbuf/grid.go b/cellbuf/grid.go index 8a5d212e..13de1114 100644 --- a/cellbuf/grid.go +++ b/cellbuf/grid.go @@ -3,7 +3,6 @@ package cellbuf import ( "bytes" "strings" - "unicode/utf8" "github.com/charmbracelet/x/ansi" ) @@ -34,13 +33,13 @@ type Grid interface { // SetContentAt writes the given data to the grid starting from the given // position and with the given width and height. -func (m WidthMethod) SetContentAt(b Grid, c string, x, y, w, h int) []int { - return setContent(b, c, x, y, w, h, m, strings.ReplaceAll, utf8.DecodeRuneInString) +func SetContentAt(m ansi.Method, b Grid, c string, x, y, w, h int) []int { + return setContent(b, c, x, y, w, h, m) } // SetContent writes the given data to the grid starting from the first cell. -func (m WidthMethod) SetContent(g Grid, content string) []int { - return m.SetContentAt(g, content, 0, 0, g.Width(), Height(content)) +func SetContent(m ansi.Method, g Grid, content string) []int { + return SetContentAt(m, g, content, 0, 0, g.Width(), Height(content)) } // Render returns a string representation of the grid with ANSI escape sequences. 
diff --git a/cellbuf/grid_write.go b/cellbuf/grid_write.go index eef10453..fd0e0a9b 100644 --- a/cellbuf/grid_write.go +++ b/cellbuf/grid_write.go @@ -2,26 +2,19 @@ package cellbuf import ( "bytes" - "unicode/utf8" + "strings" "github.com/charmbracelet/x/ansi" - "github.com/charmbracelet/x/wcwidth" ) // setContent writes the given data to the buffer starting from the first cell. // It accepts both string and []byte data types. -func setContent[ - T string | []byte, - TReplaceAllFunc func(s T, old T, new T) T, //nolint:predeclared - TDecodeRuneFunc func(p T) (rune, int), -]( +func setContent( buf Grid, - data T, + data string, x, y int, w, h int, - method WidthMethod, - replaceAll TReplaceAllFunc, - decodeRune TDecodeRuneFunc, + method ansi.Method, ) []int { var cell Cell var pen Style @@ -30,7 +23,7 @@ func setContent[ p := ansi.GetParser() defer ansi.PutParser(p) - data = replaceAll(data, T("\r\n"), T("\n")) + data = strings.ReplaceAll(data, "\r\n", "\n") // linew is a slice of line widths. We use this to keep track of the // written widths of each line. 
We use this information later to optimize @@ -41,23 +34,10 @@ func setContent[ var state byte for len(data) > 0 { - seq, width, n, newState := ansi.DecodeSequence(data, state, p) + seq, width, n, newState := method.DecodeSequenceInString(data, state, p) switch width { case 2, 3, 4: // wide cells can go up to 4 cells wide - - switch method { - case WcWidth: - if r, rw := decodeRune(data); r != utf8.RuneError { - n = rw - width = wcwidth.RuneWidth(r) - seq = T(string(r)) - newState = 0 - } - case GraphemeWidth: - // [ansi.DecodeSequence] already handles grapheme clusters - } - // Mark wide cells with emptyCell zero width // We set the wide cell down below for j := 1; j < width; j++ { @@ -227,7 +207,7 @@ func setContent[ link.URLID = id link.URL = string(params[2]) } - case ansi.Equal(seq, T("\n")): + case ansi.Equal(seq, "\n"): // Reset the rest of the line for x < w { buf.Set(x, y, spaceCell) //nolint:errcheck diff --git a/cellbuf/method.go b/cellbuf/method.go deleted file mode 100644 index 8bc7de2d..00000000 --- a/cellbuf/method.go +++ /dev/null @@ -1,11 +0,0 @@ -package cellbuf - -// WidthMethod is a type that represents the how the renderer should calculate -// the display width of cells. -type WidthMethod uint8 - -// Display width modes. -const ( - WcWidth WidthMethod = iota - GraphemeWidth -)