Skip to content

Commit

Permalink
feat(stdlib): add unicode/utf16 package (#1764)
Browse files Browse the repository at this point in the history
# Description

- Add the `unicode/utf16` package, transferred directly from Go without any
changes.
- Register it in the `stdlibWhitelist` in transpiler.go.

In an earlier JSON PR (#1415), I included `unicode/utf16` to handle
unescaping and other byte-slice operations, but realized that I wasn't
using it in that package, which led me to submit a separate PR
specifically for it.
  • Loading branch information
notJoon authored Mar 29, 2024
1 parent 9ad63e1 commit 1bc60fa
Show file tree
Hide file tree
Showing 4 changed files with 348 additions and 1 deletion.
2 changes: 1 addition & 1 deletion docs/reference/go-gno-compatibility.md
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ Legend:
| time | `full`[^7] |
| time/tzdata | `tbd` |
| unicode | `full` |
| unicode/utf16 | `tbd` |
| unicode/utf16 | `full` |
| unicode/utf8 | `full` |
| unsafe | `nondet` |

Expand Down
1 change: 1 addition & 0 deletions gnovm/pkg/transpiler/transpiler.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ var stdlibWhitelist = []string{
"time",
"unicode",
"unicode/utf8",
"unicode/utf16",

// gno
"std",
Expand Down
125 changes: 125 additions & 0 deletions gnovm/stdlibs/unicode/utf16/utf16.gno
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package utf16 implements encoding and decoding of UTF-16 sequences.
package utf16

// replacementChar and maxRune mirror unicode.ReplacementChar and
// unicode.MaxRune. They are declared locally so that this package does
// not need to depend on package unicode; the two sets of values must
// stay equal.

const (
	// replacementChar is U+FFFD, the Unicode replacement character.
	replacementChar = '\uFFFD'

	// maxRune is U+10FFFF, the maximum valid Unicode code point.
	maxRune = '\U0010FFFF'
)

// Surrogate code units fall into two adjacent half-open ranges:
// [surr1, surr2) carries the high 10 bits of a pair and
// [surr2, surr3) carries the low 10 bits. The decoded code point is
// those 20 bits plus surrSelf.
const (
	surr1 = 0xd800 // start of the high-surrogate range
	surr2 = 0xdc00 // start of the low-surrogate range
	surr3 = 0xe000 // first value past the low-surrogate range

	surrSelf = 0x10000 // smallest code point that needs a surrogate pair
)

// IsSurrogate reports whether r lies in the surrogate range
// [surr1, surr3), that is, whether it is a code point that can appear
// in a surrogate pair.
func IsSurrogate(r rune) bool {
	if r < surr1 {
		return false
	}
	return r < surr3
}

// DecodeRune combines the surrogate pair r1, r2 into the code point it
// encodes. r1 must lie in [surr1, surr2) and r2 in [surr2, surr3);
// for any other input DecodeRune returns the Unicode replacement
// code point U+FFFD.
func DecodeRune(r1, r2 rune) rune {
	highOK := surr1 <= r1 && r1 < surr2
	lowOK := surr2 <= r2 && r2 < surr3
	if !highOK || !lowOK {
		return replacementChar
	}
	// The high half supplies the upper 10 bits, the low half the lower 10.
	hi := (r1 - surr1) << 10
	lo := r2 - surr2
	return (hi | lo) + surrSelf
}

// EncodeRune splits r into its UTF-16 surrogate pair r1, r2.
// Only runes in [surrSelf, maxRune] have such a pair; for every other
// rune (invalid, or representable without surrogates) EncodeRune
// returns U+FFFD, U+FFFD.
func EncodeRune(r rune) (r1, r2 rune) {
	if surrSelf <= r && r <= maxRune {
		offset := r - surrSelf
		r1 = surr1 + (offset>>10)&0x3ff // upper 10 bits
		r2 = surr2 + offset&0x3ff       // lower 10 bits
		return r1, r2
	}
	return replacementChar, replacementChar
}

// Encode converts the Unicode code point sequence s to UTF-16.
// Runes below surrSelf that are not surrogates become a single code
// unit, runes in [surrSelf, maxRune] become a surrogate pair, and
// everything else (negative values, surrogates, values above maxRune)
// becomes U+FFFD.
func Encode(s []rune) []uint16 {
	// One unit per rune, plus one more for each rune at or above
	// surrSelf. This is an upper bound: runes above maxRune are counted
	// twice here but emit only a single replacement unit below.
	size := len(s)
	for _, r := range s {
		if r >= surrSelf {
			size++
		}
	}

	out := make([]uint16, 0, size)
	for _, r := range s {
		switch {
		case surrSelf <= r && r <= maxRune:
			// needs surrogate sequence
			hi, lo := EncodeRune(r)
			out = append(out, uint16(hi), uint16(lo))
		case r < 0, surr1 <= r && r < surr3, r > maxRune:
			// not encodable
			out = append(out, uint16(replacementChar))
		default:
			// normal rune
			out = append(out, uint16(r))
		}
	}
	return out
}

// AppendRune appends the UTF-16 encoding of the Unicode code point r
// to the end of p and returns the extended buffer. If the rune is not
// a valid Unicode code point, it appends the encoding of U+FFFD.
func AppendRune(a []uint16, r rune) []uint16 {
// This function is inlineable for fast handling of ASCII.
switch {
case 0 <= r && r < surr1, surr3 <= r && r < surrSelf:
// normal rune
return append(a, uint16(r))
case surrSelf <= r && r <= maxRune:
// needs surrogate sequence
r1, r2 := EncodeRune(r)
return append(a, uint16(r1), uint16(r2))
}
return append(a, replacementChar)
}

// Decode converts the UTF-16 code unit sequence s to Unicode code
// points. A unit outside the surrogate range is copied through as is;
// a high surrogate immediately followed by a low surrogate is combined
// into one code point; any other surrogate yields U+FFFD.
func Decode(s []uint16) []rune {
	// Each output rune consumes at least one input unit, so len(s)
	// is enough capacity.
	out := make([]rune, 0, len(s))
	for i := 0; i < len(s); i++ {
		u := s[i]
		if u < surr1 || surr3 <= u {
			// normal rune
			out = append(out, rune(u))
			continue
		}
		if u < surr2 && i+1 < len(s) {
			if next := s[i+1]; surr2 <= next && next < surr3 {
				// valid surrogate sequence: consume both units
				out = append(out, DecodeRune(rune(u), rune(next)))
				i++
				continue
			}
		}
		// invalid surrogate sequence
		out = append(out, replacementChar)
	}
	return out
}
221 changes: 221 additions & 0 deletions gnovm/stdlibs/unicode/utf16/utf16_test.gno
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package utf16

import (
"testing"
"unicode"
"unicode/utf16"
)

// encodeTest pairs a rune sequence with the UTF-16 code units that
// encoding it is expected to produce.
type encodeTest struct {
	in  []rune   // code points to encode
	out []uint16 // expected UTF-16 code units
}

// encodeTests lists the encoding cases shared by TestEncode and
// TestEncodeRune.
var encodeTests = []encodeTest{
	// Small runes map to one code unit each.
	{
		in:  []rune{1, 2, 3, 4},
		out: []uint16{1, 2, 3, 4},
	},
	// 0xffff still fits in one unit; everything from 0x10000 up to
	// 0x10ffff is written as a surrogate pair.
	{
		in:  []rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff},
		out: []uint16{0xffff, 0xd800, 0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff},
	},
	// Surrogates, out-of-range values and negative values are replaced
	// by 0xfffd; their neighbours 0xd7ff and 0xe000 are not.
	{
		in:  []rune{'a', 'b', 0xd7ff, 0xd800, 0xdfff, 0xe000, 0x110000, -1},
		out: []uint16{'a', 'b', 0xd7ff, 0xfffd, 0xfffd, 0xe000, 0xfffd, 0xfffd},
	},
}

// slicesEqual reports whether a and b have the same length and hold
// the same code units in the same order.
func slicesEqual(a, b []uint16) bool {
	if len(a) != len(b) {
		return false
	}
	for i := 0; i < len(a); i++ {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}

// TestEncode checks Encode against every entry of encodeTests.
func TestEncode(t *testing.T) {
	for _, tc := range encodeTests {
		got := Encode(tc.in)
		if slicesEqual(got, tc.out) {
			continue
		}
		t.Errorf("Encode(%x) = %x; want %x", tc.in, got, tc.out)
	}
}

// TestEncodeRune walks each encodeTests entry rune by rune. Runes that
// do not need a surrogate pair must make EncodeRune return two
// replacement characters and account for one expected unit; all other
// runes must match the next two expected units and round-trip through
// DecodeRune.
func TestEncodeRune(t *testing.T) {
	for idx, tc := range encodeTests {
		pos := 0 // index of the next unconsumed unit in tc.out
		for _, r := range tc.in {
			hi, lo := EncodeRune(r)

			needsPair := 0x10000 <= r && r <= unicode.MaxRune
			if !needsPair {
				if pos >= len(tc.out) {
					t.Errorf("#%d: ran out of tt.out", idx)
					break
				}
				if hi != unicode.ReplacementChar || lo != unicode.ReplacementChar {
					t.Errorf("EncodeRune(%#x) = %#x, %#x; want 0xfffd, 0xfffd", r, hi, lo)
				}
				pos++
				continue
			}

			if pos+1 >= len(tc.out) {
				t.Errorf("#%d: ran out of tt.out", idx)
				break
			}
			wantHi, wantLo := tc.out[pos], tc.out[pos+1]
			if hi != rune(wantHi) || lo != rune(wantLo) {
				t.Errorf("EncodeRune(%#x) = %#x, %#x; want %#x, %#x", r, hi, lo, wantHi, wantLo)
			}
			pos += 2

			// The pair must decode back to the rune it came from.
			if back := DecodeRune(hi, lo); back != r {
				t.Errorf("DecodeRune(%#x, %#x) = %#x; want %#x", hi, lo, back, r)
			}
		}
		if pos != len(tc.out) {
			t.Errorf("#%d: EncodeRune didn't generate enough output", idx)
		}
	}
}

// decodeTest pairs a UTF-16 code unit sequence with the runes that
// decoding it is expected to produce.
type decodeTest struct {
	in  []uint16 // UTF-16 code units to decode
	out []rune   // expected code points
}

// decodeTests lists the decoding cases used by TestDecode.
var decodeTests = []decodeTest{
	// Non-surrogate units map to themselves.
	{
		in:  []uint16{1, 2, 3, 4},
		out: []rune{1, 2, 3, 4},
	},
	// Well-formed surrogate pairs collapse into one rune each.
	{
		in:  []uint16{0xffff, 0xd800, 0xdc00, 0xd800, 0xdc01, 0xd808, 0xdf45, 0xdbff, 0xdfff},
		out: []rune{0xffff, 0x10000, 0x10001, 0x12345, 0x10ffff},
	},
	// A high surrogate not followed by a low surrogate is replaced.
	{
		in:  []uint16{0xd800, 'a'},
		out: []rune{0xfffd, 'a'},
	},
	// A lone low surrogate is replaced.
	{
		in:  []uint16{0xdfff},
		out: []rune{0xfffd},
	},
}

// TestDecode checks Decode against every entry of decodeTests.
func TestDecode(t *testing.T) {
	for _, tc := range decodeTests {
		got := Decode(tc.in)
		if runesEqual(got, tc.out) {
			continue
		}
		t.Errorf("Decode(%x) = %x; want %x", tc.in, got, tc.out)
	}
}

// runesEqual reports whether a and b have the same length and hold
// the same runes in the same order.
func runesEqual(a, b []rune) bool {
	if len(a) != len(b) {
		return false
	}
	for i := 0; i < len(a); i++ {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}

// decodeRuneTests lists surrogate pairs together with the rune
// DecodeRune is expected to return for each of them.
var decodeRuneTests = []struct {
	r1, r2 rune
	want   rune
}{
	{r1: 0xd800, r2: 0xdc00, want: 0x10000},
	{r1: 0xd800, r2: 0xdc01, want: 0x10001},
	{r1: 0xd808, r2: 0xdf45, want: 0x12345},
	{r1: 0xdbff, r2: 0xdfff, want: 0x10ffff},
	// Illegal pair: the replacement rune is substituted.
	{r1: 0xd800, r2: 'a', want: 0xfffd},
}

// TestDecodeRune checks DecodeRune against every entry of
// decodeRuneTests.
func TestDecodeRune(t *testing.T) {
	for idx, tc := range decodeRuneTests {
		if got := DecodeRune(tc.r1, tc.r2); got != tc.want {
			t.Errorf("%d: DecodeRune(%q, %q) = %v; want %v", idx, tc.r1, tc.r2, got, tc.want)
		}
	}
}

// surrogateTests lists code points together with the answer
// IsSurrogate is expected to give for each of them.
var surrogateTests = []struct {
	r    rune
	want bool
}{
	// Sample code points from https://en.wikipedia.org/wiki/UTF-16.
	{r: 0x007a, want: false},   // LATIN SMALL LETTER Z
	{r: 0x6c34, want: false},   // CJK UNIFIED IDEOGRAPH-6C34 (water)
	{r: 0xfeff, want: false},   // Byte Order Mark
	{r: 0x10000, want: false},  // LINEAR B SYLLABLE B008 A (first non-BMP code point)
	{r: 0x1d11e, want: false},  // MUSICAL SYMBOL G CLEF
	{r: 0x10fffd, want: false}, // PRIVATE USE CHARACTER-10FFFD

	// Boundaries of the surrogate range.
	{r: 0xd7ff, want: false}, // surr1-1
	{r: 0xd800, want: true},  // surr1
	{r: 0xdc00, want: true},  // surr2
	{r: 0xe000, want: false}, // surr3
	{r: 0xdfff, want: true},  // surr3-1
}

// TestIsSurrogate checks IsSurrogate against every entry of
// surrogateTests.
func TestIsSurrogate(t *testing.T) {
	for idx, tc := range surrogateTests {
		if got := IsSurrogate(tc.r); got != tc.want {
			t.Errorf("%d: IsSurrogate(%q) = %v; want %v", idx, tc.r, got, tc.want)
		}
	}
}

// BenchmarkDecodeValidASCII measures Decode on the code units of the
// ASCII text "hello world".
func BenchmarkDecodeValidASCII(b *testing.B) {
	input := []uint16{'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'}
	for iter := 0; iter < b.N; iter++ {
		Decode(input)
	}
}

// BenchmarkDecodeValidJapaneseChars measures Decode on the code units
// of "日本語" repeated three times; none of them is a surrogate.
func BenchmarkDecodeValidJapaneseChars(b *testing.B) {
	input := []uint16{
		0x65e5, 0x672c, 0x8a9e,
		0x65e5, 0x672c, 0x8a9e,
		0x65e5, 0x672c, 0x8a9e,
	}
	for iter := 0; iter < b.N; iter++ {
		Decode(input)
	}
}

// BenchmarkDecodeRune measures DecodeRune on five surrogate pairs that
// are prepared with EncodeRune before the timer is reset.
func BenchmarkDecodeRune(b *testing.B) {
	// U+1D4D0 to U+1D4D4: MATHEMATICAL BOLD SCRIPT CAPITAL LETTERS
	letters := []rune{
		'\U0001D4D0', '\U0001D4D1', '\U0001D4D2', '\U0001D4D3', '\U0001D4D4',
	}
	// pairs holds hi, lo, hi, lo, ... for the five letters.
	pairs := make([]rune, 10)
	for k, letter := range letters {
		hi, lo := EncodeRune(letter)
		pairs[2*k] = hi
		pairs[2*k+1] = lo
	}

	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		for k := 0; k < 5; k++ {
			DecodeRune(pairs[2*k], pairs[2*k+1])
		}
	}
}

// BenchmarkEncodeValidASCII measures Encode on the ASCII runes of
// "hello".
func BenchmarkEncodeValidASCII(b *testing.B) {
	input := []rune("hello")
	for iter := 0; iter < b.N; iter++ {
		Encode(input)
	}
}

// BenchmarkEncodeValidJapaneseChars measures Encode on the three runes
// of "日本語", each of which fits in a single code unit.
func BenchmarkEncodeValidJapaneseChars(b *testing.B) {
	input := []rune{0x65e5, 0x672c, 0x8a9e}
	for iter := 0; iter < b.N; iter++ {
		Encode(input)
	}
}

// BenchmarkEncodeRune measures EncodeRune on five runes that each
// need a surrogate pair (U+1D4D0 to U+1D4D4).
func BenchmarkEncodeRune(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		for _, letter := range []rune{'\U0001D4D0', '\U0001D4D1', '\U0001D4D2', '\U0001D4D3', '\U0001D4D4'} {
			EncodeRune(letter)
		}
	}
}

0 comments on commit 1bc60fa

Please sign in to comment.