Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

builtins: add to_tsvector {phrase,plain,}to_tsquery #92966

Merged
merged 1 commit into from
Feb 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions docs/generated/sql/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -883,6 +883,23 @@ available replica will error.</p>
</span></td><td>Immutable</td></tr></tbody>
</table>

### Full Text Search functions

<table>
<thead><tr><th>Function &rarr; Returns</th><th>Description</th><th>Volatility</th></tr></thead>
<tbody>
<tr><td><a name="phraseto_tsquery"></a><code>phraseto_tsquery(config: <a href="string.html">string</a>, text: <a href="string.html">string</a>) &rarr; tsquery</code></td><td><span class="funcdesc"><p>Converts text to a tsquery, normalizing words according to the specified or default configuration. The &lt;-&gt; operator is inserted between each token in the input.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="plainto_tsquery"></a><code>plainto_tsquery(config: <a href="string.html">string</a>, text: <a href="string.html">string</a>) &rarr; tsquery</code></td><td><span class="funcdesc"><p>Converts text to a tsquery, normalizing words according to the specified or default configuration. The &amp; operator is inserted between each token in the input.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="to_tsquery"></a><code>to_tsquery(config: <a href="string.html">string</a>, text: <a href="string.html">string</a>) &rarr; tsquery</code></td><td><span class="funcdesc"><p>Converts the input text into a tsquery by normalizing each word in the input according to the specified or default configuration. The input must already be formatted like a tsquery, in other words, subsequent tokens must be connected by a tsquery operator (&amp;, |, &lt;-&gt;, !).</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="to_tsvector"></a><code>to_tsvector(config: <a href="string.html">string</a>, text: <a href="string.html">string</a>) &rarr; tsvector</code></td><td><span class="funcdesc"><p>Converts text to a tsvector, normalizing words according to the specified or default configuration. Position information is included in the result.</p>
</span></td><td>Immutable</td></tr>
<tr><td><a name="ts_parse"></a><code>ts_parse(parser_name: <a href="string.html">string</a>, document: <a href="string.html">string</a>) &rarr; tuple{int AS tokid, string AS token}</code></td><td><span class="funcdesc"><p>ts_parse parses the given document and returns a series of records, one for each token produced by parsing. Each record includes a tokid showing the assigned token type and a token which is the text of the token.</p>
</span></td><td>Stable</td></tr></tbody>
</table>

### Fuzzy String Matching functions

<table>
Expand Down
35 changes: 35 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/tsvector
Original file line number Diff line number Diff line change
Expand Up @@ -240,3 +240,38 @@ EXPLAIN SELECT * FROM a@a_a_idx WHERE a @@ b
statement ok
CREATE TABLE t95680 (c1 FLOAT NOT NULL, c2 TSVECTOR NOT NULL, INVERTED INDEX (c1 ASC, c2 ASC));
INSERT INTO t95680 VALUES (1.0::FLOAT, e'\'kCrLZNl\' \'sVDj\' \'yO\' \'z\':54C,440B,519C,794B':::TSVECTOR);

# More tests for these functions live in pkg/util/tsearch/testdata
query IT
SELECT * FROM ts_parse('default', 'Hello this is a parsi-ng t.est 1.234 4 case324')
----
1 Hello
1 this
1 is
1 a
1 parsi
1 ng
1 t
1 est
1 1
1 234
1 4
1 case324

query T
SELECT * FROM to_tsvector('simple', 'Hello this is a parsi-ng t.est 1.234 4 case324')
----
'1':9 '234':10 '4':11 'a':4 'case324':12 'est':8 'hello':1 'is':3 'ng':6 'parsi':5 't':7 'this':2

query T
SELECT * FROM phraseto_tsquery('simple', 'Hello this is a parsi-ng t.est 1.234 4 case324')
----
'hello' <-> 'this' <-> 'is' <-> 'a' <-> 'parsi' <-> 'ng' <-> 't' <-> 'est' <-> '1' <-> '234' <-> '4' <-> 'case324'

query T
SELECT * FROM to_tsquery('simple', 'a | b & c <-> d')
----
'a' | 'b' & 'c' <-> 'd'

query error syntax
SELECT * FROM to_tsquery('simple', 'Hello this is a parsi-ng t.est 1.234 4 case324')
2 changes: 2 additions & 0 deletions pkg/sql/sem/builtins/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ go_library(
"show_create_all_tables_builtin.go",
"show_create_all_types_builtin.go",
"trigram_builtins.go",
"tsearch_builtins.go",
"window_builtins.go",
"window_frame_builtins.go",
],
Expand Down Expand Up @@ -126,6 +127,7 @@ go_library(
"//pkg/util/tracing",
"//pkg/util/tracing/tracingpb",
"//pkg/util/trigram",
"//pkg/util/tsearch",
"//pkg/util/ulid",
"//pkg/util/unaccent",
"//pkg/util/uuid",
Expand Down
4 changes: 0 additions & 4 deletions pkg/sql/sem/builtins/builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -3791,13 +3791,9 @@ value if you rely on the HLC for accuracy.`,
"array_to_tsvector": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"get_current_ts_config": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"numnode": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"plainto_tsquery": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"phraseto_tsquery": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"querytree": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"setweight": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"strip": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"to_tsquery": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"to_tsvector": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"json_to_tsvector": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"jsonb_to_tsvector": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
"ts_delete": makeBuiltin(tree.FunctionProperties{UnsupportedWithIssue: 7821, Category: builtinconstants.CategoryFullTextSearch}),
Expand Down
5 changes: 5 additions & 0 deletions pkg/sql/sem/builtins/fixed_oids.go
Original file line number Diff line number Diff line change
Expand Up @@ -2045,6 +2045,11 @@ var builtinOidsArray = []string{
2069: `crdb_internal.create_tenant(parameters: jsonb) -> int`,
2070: `crdb_internal.num_inverted_index_entries(val: tsvector, version: int) -> int`,
2072: `crdb_internal.upsert_dropped_relation_gc_ttl(desc_id: int, gc_ttl: interval) -> bool`,
2073: `to_tsquery(config: string, text: string) -> tsquery`,
2074: `to_tsvector(config: string, text: string) -> tsvector`,
2075: `phraseto_tsquery(config: string, text: string) -> tsquery`,
2076: `plainto_tsquery(config: string, text: string) -> tsquery`,
2077: `ts_parse(parser_name: string, document: string) -> tuple{int AS tokid, string AS token}`,
}

var builtinOidsBySignature map[string]oid.Oid
Expand Down
169 changes: 169 additions & 0 deletions pkg/sql/sem/builtins/tsearch_builtins.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package builtins

import (
"context"

"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
"github.com/cockroachdb/cockroach/pkg/sql/sem/builtins/builtinconstants"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sem/volatility"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util/tsearch"
)

// init registers every builtin defined in tsearchBuiltins. Before a
// definition is handed to registerBuiltin it is tagged with the full text
// search category and marked as available on the public schema.
func init() {
	for name, def := range tsearchBuiltins {
		def.props.AvailableOnPublicSchema = true
		def.props.Category = builtinconstants.CategoryFullTextSearch
		registerBuiltin(name, def)
	}
}

// tsParseGenerator is the value generator behind the ts_parse builtin. It
// emits one row per token of the input document.
type tsParseGenerator struct {
	// input is the document text to tokenize.
	input string
	// tokens holds the tokens that have not been emitted yet. It is filled in
	// by Start and consumed from the front by Next.
	tokens []string
	// nextToken is the token most recently selected by Next; Values reports it.
	nextToken string
}

// ResolvedType returns the labeled tuple type (tokid, token) of the rows
// produced by the generator.
//
// It uses a pointer receiver, like Start and Next, so that all methods of
// tsParseGenerator share one receiver kind and the struct is not copied on
// each call.
func (t *tsParseGenerator) ResolvedType() *types.T {
	return tsParseType
}

// Start tokenizes the generator's input with tsearch.TSParse and stores the
// resulting tokens for later consumption by Next. The context and transaction
// arguments are unused, and the returned error is always nil.
func (g *tsParseGenerator) Start(_ context.Context, _ *kv.Txn) error {
	parsed := tsearch.TSParse(g.input)
	g.tokens = parsed
	return nil
}

// Next advances the generator to the following token. It reports true when a
// token was made current (retrievable through Values) and false once all
// tokens have been consumed. The returned error is always nil.
func (g *tsParseGenerator) Next(_ context.Context) (bool, error) {
	if len(g.tokens) > 0 {
		// Pop the head of the pending tokens and make it the current one.
		g.nextToken = g.tokens[0]
		g.tokens = g.tokens[1:]
		return true, nil
	}
	return false, nil
}

// Values returns the current row: the token id followed by the text of the
// token most recently selected by Next.
//
// The token id is always reported as 1; this generator does not assign any
// other token type.
//
// It uses a pointer receiver, like Start and Next, so that all methods of
// tsParseGenerator share one receiver kind and the struct is not copied for
// every emitted row.
func (t *tsParseGenerator) Values() (tree.Datums, error) {
	return tree.Datums{tree.NewDInt(1), tree.NewDString(t.nextToken)}, nil
}

// Close is a no-op: the generator holds no resources that need releasing.
//
// It uses a pointer receiver, like Start and Next, so that all methods of
// tsParseGenerator share one receiver kind.
func (t *tsParseGenerator) Close(_ context.Context) {}

// tsParseType is the row type produced by ts_parse: a labeled tuple made of
// an int column named "tokid" and a string column named "token".
var tsParseType = types.MakeLabeledTuple(
	[]*types.T{
		types.Int,
		types.String,
	},
	[]string{
		"tokid",
		"token",
	},
)

// tsearchBuiltins maps the name of each full text search builtin to its
// definition. The entries are registered by this file's init function.
//
// Every function takes the text search configuration (or, for ts_parse, the
// parser name) as its first string argument and the text to process as its
// second string argument.
var tsearchBuiltins = map[string]builtinDefinition{
	// ts_parse is a set-returning function that yields one (tokid, token) row
	// per token of the document. Only the "default" parser is accepted.
	"ts_parse": makeBuiltin(genProps(),
		makeGeneratorOverload(
			tree.ParamTypes{{Name: "parser_name", Typ: types.String}, {Name: "document", Typ: types.String}},
			// Use the same type value that tsParseGenerator.ResolvedType
			// returns so the declared and the produced row types cannot drift
			// apart.
			tsParseType,
			func(_ context.Context, _ *eval.Context, args tree.Datums) (eval.ValueGenerator, error) {
				parserName := string(tree.MustBeDString(args[0]))
				if parserName != "default" {
					return nil, pgerror.Newf(pgcode.UndefinedObject, "text search parser %q does not exist", parserName)
				}
				return &tsParseGenerator{input: string(tree.MustBeDString(args[1]))}, nil
			},
			"ts_parse parses the given document and returns a series of records, "+
				"one for each token produced by parsing. "+
				"Each record includes a tokid showing the assigned token type and a token which is the text of the token.",
			volatility.Stable,
		),
	),
	// to_tsvector converts a document into a tsvector using the given
	// configuration.
	"to_tsvector": makeBuiltin(
		tree.FunctionProperties{},
		tree.Overload{
			Types:      tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}},
			ReturnType: tree.FixedReturnType(types.TSVector),
			Fn: func(_ context.Context, _ *eval.Context, args tree.Datums) (tree.Datum, error) {
				// Parse, stem, and stopword the input.
				config := string(tree.MustBeDString(args[0]))
				document := string(tree.MustBeDString(args[1]))
				vector, err := tsearch.DocumentToTSVector(config, document)
				if err != nil {
					return nil, err
				}
				return &tree.DTSVector{TSVector: vector}, nil
			},
			Info: "Converts text to a tsvector, normalizing words according to the specified or default configuration. " +
				"Position information is included in the result.",
			Volatility: volatility.Immutable,
		},
	),
	// to_tsquery converts text that is already written in tsquery syntax into
	// a tsquery using the given configuration.
	"to_tsquery": makeBuiltin(
		tree.FunctionProperties{},
		tree.Overload{
			Types:      tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}},
			ReturnType: tree.FixedReturnType(types.TSQuery),
			Fn: func(_ context.Context, _ *eval.Context, args tree.Datums) (tree.Datum, error) {
				config := string(tree.MustBeDString(args[0]))
				input := string(tree.MustBeDString(args[1]))
				query, err := tsearch.ToTSQuery(config, input)
				if err != nil {
					return nil, err
				}
				return &tree.DTSQuery{TSQuery: query}, nil
			},
			Info: "Converts the input text into a tsquery by normalizing each word in the input according to " +
				"the specified or default configuration. The input must already be formatted like a tsquery, in other words, " +
				"subsequent tokens must be connected by a tsquery operator (&, |, <->, !).",
			Volatility: volatility.Immutable,
		},
	),
	// plainto_tsquery converts plain text into a tsquery using the given
	// configuration.
	"plainto_tsquery": makeBuiltin(
		tree.FunctionProperties{},
		tree.Overload{
			Types:      tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}},
			ReturnType: tree.FixedReturnType(types.TSQuery),
			Fn: func(_ context.Context, _ *eval.Context, args tree.Datums) (tree.Datum, error) {
				config := string(tree.MustBeDString(args[0]))
				input := string(tree.MustBeDString(args[1]))
				query, err := tsearch.PlainToTSQuery(config, input)
				if err != nil {
					return nil, err
				}
				return &tree.DTSQuery{TSQuery: query}, nil
			},
			Info: "Converts text to a tsquery, normalizing words according to the specified or default configuration." +
				" The & operator is inserted between each token in the input.",
			Volatility: volatility.Immutable,
		},
	),
	// phraseto_tsquery converts plain text into a phrase tsquery using the
	// given configuration.
	"phraseto_tsquery": makeBuiltin(
		tree.FunctionProperties{},
		tree.Overload{
			Types:      tree.ParamTypes{{Name: "config", Typ: types.String}, {Name: "text", Typ: types.String}},
			ReturnType: tree.FixedReturnType(types.TSQuery),
			Fn: func(_ context.Context, _ *eval.Context, args tree.Datums) (tree.Datum, error) {
				config := string(tree.MustBeDString(args[0]))
				input := string(tree.MustBeDString(args[1]))
				query, err := tsearch.PhraseToTSQuery(config, input)
				if err != nil {
					return nil, err
				}
				return &tree.DTSQuery{TSQuery: query}, nil
			},
			Info: "Converts text to a tsquery, normalizing words according to the specified or default configuration." +
				" The <-> operator is inserted between each token in the input.",
			Volatility: volatility.Immutable,
		},
	),
}
45 changes: 45 additions & 0 deletions pkg/sql/sem/eval/testdata/eval/tsearch
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,48 @@ eval
'bar:3 baz:5'::tsvector @@ 'baz <2> bar'::tsquery
----
false

eval
to_tsvector('simple', 'You have power over your mind – not outside events. Realize this, and you will find strength.')
----
'and':12 'events':9 'find':15 'have':2 'mind':6 'not':7 'outside':8 'over':4 'power':3 'realize':10 'strength':16 'this':11 'will':14 'you':1,13 'your':5

eval
to_tsquery('simple', 'hello')
----
'hello'

eval
to_tsquery('simple', 'hello | there')
----
'hello' | 'there'

eval
to_tsquery('simple', 'hello | the#re')
----
'hello' | 'the' <-> 're'

eval
plainto_tsquery('simple', 'hello there')
----
'hello' & 'there'

eval
plainto_tsquery('simple', 'hello the#re')
----
'hello' & 'the' & 're'

eval
phraseto_tsquery('simple', 'You have power over your mind – not outside events. Realize this, and you will find strength.')
----
'you' <-> 'have' <-> 'power' <-> 'over' <-> 'your' <-> 'mind' <-> 'not' <-> 'outside' <-> 'events' <-> 'realize' <-> 'this' <-> 'and' <-> 'you' <-> 'will' <-> 'find' <-> 'strength'

eval
phraseto_tsquery('simple', 'hello there')
----
'hello' <-> 'there'

eval
phraseto_tsquery('simple', 'hello the#re')
----
'hello' <-> 'the' <-> 're'
Loading