-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenize.go
35 lines (29 loc) · 940 Bytes
/
tokenize.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
package gosim
import (
	"strings"
	"unicode"
	"unicode/utf8"
)
// Tokenize transforms unstructured document text into a list of tokens.
type Tokenize func(text string) []string

// MakeDefaultTokenizer returns a Tokenize that splits text on any rune that
// is not a letter, number, apostrophe, or hyphen, case-folds each token, and
// trims leading/trailing non-alphanumeric runes. Tokens shorter than two
// runes are discarded.
func MakeDefaultTokenizer() Tokenize {
	return func(text string) []string {
		// Pass 1: Split the string into "coarse" tokens. Apostrophes and
		// hyphens are kept so contractions ("it's") and compounds survive;
		// stray leading/trailing ones are trimmed in pass 2.
		tokens := strings.FieldsFunc(text, func(c rune) bool {
			return !(unicode.IsLetter(c) || unicode.IsNumber(c) || c == '\'' || c == '-')
		})
		// Pass 2: case-fold and trim non-alphanumeric characters.
		filteredTokens := make([]string, 0, len(tokens))
		for _, token := range tokens {
			token = strings.ToLower(token) // case folding
			token = strings.TrimFunc(token, func(c rune) bool {
				return !(unicode.IsLetter(c) || unicode.IsNumber(c))
			})
			// Discard single-character tokens while we're at it. Count
			// runes, not bytes: len(token) would wrongly keep a single
			// multi-byte rune such as '界' or 'é'.
			if utf8.RuneCountInString(token) >= 2 {
				filteredTokens = append(filteredTokens, token)
			}
		}
		return filteredTokens
	}
}