Skip to content

Commit

Permalink
feat(chsql): add string search token iterator
Browse files Browse the repository at this point in the history
  • Loading branch information
tdakkota committed Jul 16, 2024
1 parent ddd413c commit c804add
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 2 deletions.
35 changes: 33 additions & 2 deletions internal/chstorage/chsql/token.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package chsql

// IsToken whether if given string is a single token.
// IsSingleToken reports whether the given string is a single token.
//
// See https://clickhouse.com/docs/en/sql-reference/functions/string-search-functions#hastoken.
// See https://github.com/ClickHouse/ClickHouse/blob/755b73f3fc99847f40ac4d9186bb19116e709c37/src/Interpreters/ITokenExtractor.cpp#L84.
Expand All @@ -11,13 +11,44 @@ func IsSingleToken[S ~string | ~[]byte](s S) bool {
// If the string contains any non-alphanumeric ASCII character,
// then it is not a single token.
for _, c := range []byte(s) {
if c < 0x80 && !isAlphaNumeric(c) {
if isTokenSeparator(c) {
return false
}
}
return true
}

// CollectTokens splits s into tokens and calls cb for each of them, in order
// of appearance.
//
// A token is a non-empty run of consecutive bytes for which isTokenSeparator
// reports false; empty runs between adjacent separators are skipped. Every
// token passed to cb is a sub-slice of s, no copy is made.
//
// Iteration stops early as soon as cb returns false.
func CollectTokens[S ~string | ~[]byte](s S, cb func(s S) bool) {
	// FIXME(tdakkota): use go1.23 iterators.

	// start is the index of the first byte following the last seen separator,
	// i.e. where the token currently being scanned begins.
	start := 0
	for i := 0; i < len(s); i++ {
		if !isTokenSeparator(s[i]) {
			continue
		}
		// s[i] is a separator: report whatever was accumulated before it.
		if tok := s[start:i]; len(tok) > 0 && !cb(tok) {
			return
		}
		start = i + 1
	}
	// Report the trailing token, if the input does not end with a separator.
	// The callback result is irrelevant here: there is nothing left to iterate.
	if tok := s[start:]; len(tok) > 0 {
		cb(tok)
	}
}

// isTokenSeparator reports whether c splits tokens: it must be an ASCII byte
// (below 0x80) that isAlphaNumeric rejects. Bytes at or above 0x80 are never
// separators.
func isTokenSeparator(c byte) bool {
	if c >= 0x80 {
		return false
	}
	return !isAlphaNumeric(c)
}

func isAlphaNumeric(c byte) bool {
return (c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
Expand Down
49 changes: 49 additions & 0 deletions internal/chstorage/chsql/token_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,52 @@ func TestIsSingleToken(t *testing.T) {
})
}
}

// TestCollectTokens runs CollectTokens over a table of inputs and compares
// the collected tokens with the expected list for each input.
func TestCollectTokens(t *testing.T) {
	type testCase struct {
		input    string
		expected []string
	}
	cases := []testCase{
		{input: ``, expected: nil},
		{input: ` `, expected: nil},
		{input: `10`, expected: []string{"10"}},
		{input: ` 10 `, expected: []string{"10"}},
		{input: `abc`, expected: []string{"abc"}},
		{input: `помидоры abc огурцы`, expected: []string{"помидоры", "abc", "огурцы"}},
		{input: `"error": "ENOENT"`, expected: []string{"error", "ENOENT"}},
		{
			input:    `{"msg": "Request", "error": "invalid data"}`,
			expected: []string{"msg", "Request", "error", "invalid", "data"},
		},
	}
	for idx, tc := range cases {
		tc := tc
		name := fmt.Sprintf("Test%d", idx+1)
		t.Run(name, func(t *testing.T) {
			// Left nil on purpose so that inputs without tokens compare
			// equal to the nil expectation.
			var tokens []string
			CollectTokens(tc.input, func(tok string) bool {
				tokens = append(tokens, tok)
				return true
			})
			require.Equal(t, tc.expected, tokens)
		})
	}
}

// FuzzCollectTokens feeds arbitrary strings to CollectTokens and checks that
// it does not panic and never reports an empty token.
func FuzzCollectTokens(f *testing.F) {
	for _, s := range []string{
		`помидоры abc огурцы`,
		`{"msg": "Request", "error": "invalid data"}`,
	} {
		f.Add(s)
	}
	f.Fuzz(func(t *testing.T, input string) {
		defer func() {
			r := recover()
			if r != nil || t.Failed() {
				t.Logf("Input: %#q", input)
			}
			if r != nil {
				// Re-raise the panic after logging: recovering without
				// re-panicking would let the iteration pass and hide the
				// crash from the fuzzing engine.
				panic(r)
			}
		}()
		CollectTokens(input, func(tok string) bool {
			if tok == "" {
				t.Error("CollectTokens reported an empty token")
			}
			return true
		})
	})
}

0 comments on commit c804add

Please sign in to comment.