// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package tsearch

import (
	"fmt"
	"sort"
	"strconv"
	"strings"
	"unicode"

	"github.com/blevesearch/snowballstem"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
	"github.com/cockroachdb/errors"
)

// This file defines the TSVector data structure, which is used to implement
// Postgres's tsvector text search mechanism.
// See https://www.postgresql.org/docs/current/datatype-textsearch.html for
// context on what each of the pieces does.
//
// TSVector is ultimately used to represent a document as a posting list - a
// list of lexemes in the doc (typically without stop words like common or short
// words, and typically stemmed using a stemming algorithm like snowball
// stemming (https://snowballstem.org/)), along with an associated set of
// positions that those lexemes occur within the document.
//
// Typically, this posting list is then stored in an inverted index within the
// database to accelerate searches of terms within the document.
//
// The key structures are:
// - tsTerm is a document term (also referred to as lexeme) along with a
// position list, which contains the positions within a document that a term
// appeared, along with an optional weight, which controls matching.
// - tsTerm is also used during parsing of both TSQueries and TSVectors, so a
// tsTerm can also represent a TSQuery operator.
// - tsWeight represents the weight of a given lexeme. It's also used in
// queries, where a "star" weight is available that matches any weight.
// - TSVector is a list of tsTerms, ordered by their lexeme.

// tsWeight is a bitfield that represents the weight of a given term. When
// stored in a TSVector, only 1 of the bits will be set. The default weight is
// D - as a result, we store 0 for the weight of terms with weight D or no
// specified weight. The weightStar value is never set in a TSVector weight.
//
// tsWeight is also used inside of TSQueries, to specify the weight to search.
// Within TSQueries, the absence of a weight is the default, and indicates that
// the search term should match any matching term, regardless of its weight. If
// one or more of the weights are set in a search term, it indicates that the
// query should match only terms with the given weights.
type tsWeight byte
const (
// These enum values are a bitfield and must be kept in order.
weightD tsWeight = 1 << iota
weightC
weightB
weightA
// weightStar is a special "weight" that can be specified only in a search
// term. It indicates prefix matching, which will allow the term to match any
// document term that begins with the search term.
weightStar
invalidWeight
weightAny = weightA | weightB | weightC | weightD
)
func (w tsWeight) String() string {
var ret strings.Builder
if w&weightStar != 0 {
ret.WriteByte('*')
}
if w&weightA != 0 {
ret.WriteByte('A')
}
if w&weightB != 0 {
ret.WriteByte('B')
}
if w&weightC != 0 {
ret.WriteByte('C')
}
if w&weightD != 0 {
ret.WriteByte('D')
}
return ret.String()
}
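
// Since tsWeight is a bitfield, weights can be OR'd together, and String
// renders one letter per set bit, from A down to D (with '*' first if set).
// For example:
//
//	(weightA | weightB).String() // "AB"
//	weightAny.String()           // "ABCD"
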
// TSVectorPGEncoding returns the PG-compatible wire protocol encoding for a
// given weight. Note that this is only allowable for TSVector tsweights, which
// can't have more than one weight set at the same time. In a TSQuery, you might
// have more than one weight per lexeme, which is not encodable using this
// scheme.
func (w tsWeight) TSVectorPGEncoding() (byte, error) {
switch w {
case weightA:
return 3, nil
case weightB:
return 2, nil
case weightC:
return 1, nil
case weightD, 0:
return 0, nil
}
return 0, errors.Errorf("invalid tsvector weight %d", w)
}
// matches returns true if the receiver is matched by the input tsquery weight.
func (w tsWeight) matches(queryWeight tsWeight) bool {
if queryWeight == weightAny {
return true
}
if w&queryWeight > 0 {
return true
}
// If we're querying for D, and the receiver has no weight, that's also a
// match.
return queryWeight&weightD > 0 && w == 0
}
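
// For example: a term stored with weight A matches a query asking for
// "A or B", and a term stored with no explicit weight (which encodes the
// default, D) matches a query asking for D:
//
//	weightA.matches(weightA | weightB) // true
//	tsWeight(0).matches(weightD)       // true: 0 is the default weight D
//	weightC.matches(weightA)           // false
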
func tsWeightFromVectorPGEncoding(b byte) (tsWeight, error) {
switch b {
case 3:
return weightA, nil
case 2:
return weightB, nil
case 1:
return weightC, nil
case 0:
// We don't explicitly return weightD, since it's the default.
return 0, nil
}
return 0, errors.Errorf("invalid encoded tsvector weight %d", b)
}
// tsPosition is a position within a document, along with an optional weight.
type tsPosition struct {
position uint16
weight tsWeight
}
// tsTerm is either a lexeme and position list, or an operator (when parsing
// a TSQuery).
type tsTerm struct {
// lexeme is at most 2046 bytes.
lexeme string
positions []tsPosition
// The operator and followedN fields are only used when parsing a TSQuery.
operator tsOperator
// followedN is set only when operator = followedby. It is at most 16384.
followedN uint16
}
func newLexemeTerm(lexeme string) (tsTerm, error) {
if len(lexeme) > 2046 {
return tsTerm{}, pgerror.Newf(pgcode.ProgramLimitExceeded, "word is too long (%d bytes, max 2046 bytes)", len(lexeme))
}
return tsTerm{lexeme: lexeme}, nil
}
func (t tsTerm) String() string {
if t.operator != 0 {
switch t.operator {
case and:
return "&"
case or:
return "|"
case not:
return "!"
case lparen:
return "("
case rparen:
return ")"
case followedby:
if t.followedN == 1 {
return "<->"
}
return fmt.Sprintf("<%d>", t.followedN)
}
}
var buf strings.Builder
buf.WriteByte('\'')
for _, r := range t.lexeme {
if r == '\'' {
// Single quotes are escaped as double single quotes inside of a TSVector.
buf.WriteString(`''`)
} else {
buf.WriteRune(r)
}
}
buf.WriteByte('\'')
for i, pos := range t.positions {
if i > 0 {
buf.WriteByte(',')
} else {
buf.WriteByte(':')
}
if pos.position > 0 {
buf.WriteString(strconv.Itoa(int(pos.position)))
}
buf.WriteString(pos.weight.String())
}
return buf.String()
}
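
// As an illustration of the serialization rules above: a term for the lexeme
// don't, seen at position 1 with weight A and at position 3 with the default
// weight, renders with the quote doubled and the default weight omitted:
//
//	t := tsTerm{lexeme: "don't", positions: []tsPosition{
//		{position: 1, weight: weightA}, {position: 3},
//	}}
//	t.String() // `'don''t':1A,3`
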
func (t tsTerm) matchesWeight(targetWeight tsWeight) bool {
if targetWeight == weightAny {
return true
}
if len(t.positions) == 0 {
// A "stripped" tsvector (no associated positions) always matches any input
// weight.
return true
}
for _, pos := range t.positions {
if pos.weight.matches(targetWeight) {
return true
}
}
return false
}
// TSVector is a sorted list of terms, each of which is a lexeme that might have
// an associated position within an original document.
type TSVector []tsTerm
func (t TSVector) String() string {
var buf strings.Builder
for i, term := range t {
if i > 0 {
buf.WriteByte(' ')
}
buf.WriteString(term.String())
}
return buf.String()
}
// ParseTSVector produces a TSVector from an input string. The input will be
// sorted by lexeme, but will not be automatically stemmed or stop-worded.
func ParseTSVector(input string) (TSVector, error) {
parser := tsVectorLexer{
input: input,
state: expectingTerm,
}
ret, err := parser.lex()
if err != nil {
return ret, err
}
return normalizeTSVector(ret)
}
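
// For example, parsing a vector with duplicate lexemes merges their position
// lists and sorts the result by lexeme (a sketch of the expected behavior,
// per the normalization below):
//
//	vec, _ := ParseTSVector("'b':3 'a':2 'b':7")
//	vec.String() // "'a':2 'b':3,7"
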
func normalizeTSVector(ret TSVector) (TSVector, error) {
if len(ret) > 1 {
// Sort and de-duplicate the resultant TSVector.
sort.Slice(ret, func(i, j int) bool {
return ret[i].lexeme < ret[j].lexeme
})
// Then distinct: (wouldn't it be nice if Go had generics?)
lastUniqueIdx := 0
for j := 1; j < len(ret); j++ {
if ret[j].lexeme != ret[lastUniqueIdx].lexeme {
// We found a unique entry, at index j. The last unique entry in the
// array was at lastUniqueIdx, so set the entry after that one to our
// new unique entry, and bump lastUniqueIdx for the next loop iteration.
// First, sort and unique the position list now that we've collapsed all
// of the identical lexemes.
ret[lastUniqueIdx].positions = sortAndUniqTSPositions(ret[lastUniqueIdx].positions)
lastUniqueIdx++
ret[lastUniqueIdx] = ret[j]
} else {
// The last entries were not unique. Collapse their positions into the
// first entry's list.
ret[lastUniqueIdx].positions = append(ret[lastUniqueIdx].positions, ret[j].positions...)
}
}
ret = ret[:lastUniqueIdx+1]
}
if len(ret) >= 1 {
// The position list of the final unique entry is never sorted inside the
// loop above, so take care of it here. This also covers the case where the
// input had only 1 entry.
lastIdx := len(ret) - 1
ret[lastIdx].positions = sortAndUniqTSPositions(ret[lastIdx].positions)
}
return ret, nil
}
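
// To make the compaction concrete: after sorting, lexemes [a, b, b, c] are
// collapsed in place, with the duplicate b's positions appended onto the
// first b before the slice is truncated:
//
//	[a, b:1, b:4, c] -> [a, b:1,4, c]
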
var validCharTables = []*unicode.RangeTable{unicode.Letter, unicode.Number}
// TSParse is the function that splits an input text into a list of
// tokens. For now, the parser that we use is very simple: it splits the input
// into tokens by treating every non-letter, non-number character as
// whitespace. (Lowercasing happens later, in TSLexize.)
//
// The Postgres text search parser is much, much more sophisticated. The
// documentation (https://www.postgresql.org/docs/current/textsearch-parsers.html)
// gives more information, but roughly, each token is categorized into one of
// about 20 different buckets, such as asciiword, url, email, host, float, int,
// version, tag, etc. It uses very specific rules to produce these outputs.
// Another interesting transformation is returning multiple tokens for a
// hyphenated word, including a token that represents the entire hyphenated word,
// as well as one for each of the hyphenated components.
//
// It's not clear whether we need to mimic this functionality exactly; we will
// likely want to do so eventually.
func TSParse(input string) []string {
return strings.FieldsFunc(input, func(r rune) bool {
return !unicode.IsOneOf(validCharTables, r)
})
}
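
// For example, every run of non-letter, non-number characters acts as a
// separator, and no case folding happens at this stage:
//
//	TSParse("Hello, world-wide web 2.0!")
//	// returns ["Hello", "world", "wide", "web", "2", "0"]
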
// TSLexize implements the "dictionary" construct that's exposed via ts_lexize.
// It gets invoked once per input token to produce an output lexeme during
// routines like to_tsvector and to_tsquery.
// It can return false in the second parameter to indicate a stopword was found.
func TSLexize(config string, token string) (lexeme string, ok bool, err error) {
stopwords, ok := stopwordsMap[config]
if !ok {
return "", false, pgerror.Newf(pgcode.UndefinedObject, "text search configuration %q does not exist", config)
}
lower := strings.ToLower(token)
if _, ok := stopwords[lower]; ok {
return "", false, nil
}
stemmer, err := getStemmer(config)
if err != nil {
return "", false, err
}
env := snowballstem.NewEnv(lower)
stemmer(env)
return env.Current(), true, nil
}
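
// A sketch of the expected behavior, assuming an "english" configuration
// exists in stopwordsMap and maps to the snowball English stemmer:
//
//	TSLexize("english", "Running") // returns "run", true, nil (lowercased, stemmed)
//	TSLexize("english", "the")     // returns "", false, nil (stopword)
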
// DocumentToTSVector parses an input document into lexemes, removes stop words,
// stems and normalizes the lexemes, and returns a TSVector annotated with
// lexeme positions according to a text search configuration passed by name.
func DocumentToTSVector(config string, input string) (TSVector, error) {
tokens := TSParse(input)
vector := make(TSVector, 0, len(tokens))
for i := range tokens {
lexeme, ok, err := TSLexize(config, tokens[i])
if err != nil {
return nil, err
}
if !ok {
continue
}
term := tsTerm{lexeme: lexeme}
pos := i + 1
if pos > maxTSVectorPosition {
// Cap positions that exceed the maximum, like Postgres does.
pos = maxTSVectorPosition
}
term.positions = []tsPosition{{position: uint16(pos)}}
vector = append(vector, term)
}
return normalizeTSVector(vector)
}
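
// Putting the pipeline together, a sketch of the expected behavior (again
// assuming an "english" configuration): stopwords are dropped but still
// advance the position counter, surviving tokens are stemmed, and the result
// is sorted by lexeme:
//
//	vec, _ := DocumentToTSVector("english", "The quick brown foxes jumped")
//	vec.String() // "'brown':3 'fox':4 'jump':5 'quick':2"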