Skip to content

Commit

Permalink
tsearch: add stemming and stopword elimination
Browse files Browse the repository at this point in the history
This commit adds stopword elimination for text search. The supported
languages are the same ones that Postgres supports. The stopword lists
were copied from Postgres commit e757080e041214cf6983e3e77ef01e83f1371d72.

Also, add snowball stemming provided by the blevesearch snowball
stemming library.

Release note (sql change): Add stemming and stopword-eliminating text
search configurations for English, Danish, Dutch, Finnish, French,
German, Hungarian, Italian, Norwegian, Portuguese, Russian, Spanish,
Swedish, and Turkish.
  • Loading branch information
jordanlewis committed Feb 26, 2023
1 parent 3ebacd6 commit abdc96e
Show file tree
Hide file tree
Showing 25 changed files with 3,249 additions and 20 deletions.
10 changes: 10 additions & 0 deletions DEPS.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -944,6 +944,16 @@ def go_deps():
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/blang/semver/com_github_blang_semver-v3.5.1+incompatible.zip",
],
)
go_repository(
name = "com_github_blevesearch_snowballstem",
build_file_proto_mode = "disable_global",
importpath = "github.com/blevesearch/snowballstem",
sha256 = "6640a408ddcec84810873cc678570717c02d5b7b932f37672c44caea33469506",
strip_prefix = "github.com/blevesearch/snowballstem@v0.9.0",
urls = [
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/blevesearch/snowballstem/com_github_blevesearch_snowballstem-v0.9.0.zip",
],
)
go_repository(
name = "com_github_bmizerany_assert",
build_file_proto_mode = "disable_global",
Expand Down
1 change: 1 addition & 0 deletions build/bazelutil/distdir_files.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ DISTDIR_FILES = {
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/bkaradzic/go-lz4/com_github_bkaradzic_go_lz4-v1.0.0.zip": "525f5633a4d9c8a32b5b5763c4e423ad061e773cf8cfeb21737f491feb531666",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/bketelsen/crypt/com_github_bketelsen_crypt-v0.0.4.zip": "ab24f8c0386cc7fce86f4e6680c32214e1e597980bd80127ac84e71ace6763da",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/blang/semver/com_github_blang_semver-v3.5.1+incompatible.zip": "8d032399cf835b93f7cf641b5477a31a002059eed7888a775f97bd3e9677ad3c",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/blevesearch/snowballstem/com_github_blevesearch_snowballstem-v0.9.0.zip": "6640a408ddcec84810873cc678570717c02d5b7b932f37672c44caea33469506",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/bmizerany/assert/com_github_bmizerany_assert-v0.0.0-20160611221934-b7ed37b82869.zip": "2532a167df77ade7e8012f07c0e3db4d4c15abdb7ffa7b05e1d961408da9a539",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/bmizerany/pat/com_github_bmizerany_pat-v0.0.0-20170815010413-6226ea591a40.zip": "ed04bed4d193e25371ebc6524984da4af9ece5c107fcc82d5aa4914b726706d2",
"https://storage.googleapis.com/cockroach-godeps/gomod/github.com/bmizerany/perks/com_github_bmizerany_perks-v0.0.0-20141205001514-d9a9656a3a4b.zip": "b78e7083e73b6c2d63a30d073515b2a03dbe3115171601009211208ee0c6046e",
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ require (
github.com/axiomhq/hyperloglog v0.0.0-20181223111420-4b99d0c2c99e
github.com/bazelbuild/rules_go v0.26.0
github.com/biogo/store v0.0.0-20160505134755-913427a1d5e8
github.com/blevesearch/snowballstem v0.9.0
github.com/buchgr/bazel-remote v1.3.3
github.com/bufbuild/buf v0.56.0
github.com/cenkalti/backoff v2.2.1+incompatible
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,8 @@ github.com/bketelsen/crypt v0.0.4/go.mod h1:aI6NrJ0pMGgvZKL1iVgXLnfIFJtfV+bKCoqO
github.com/blang/semver v3.1.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
github.com/blang/semver v3.5.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s=
github.com/blevesearch/snowballstem v0.9.0/go.mod h1:PivSj3JMc8WuaFkTSRDW2SlrulNWPl4ABg1tC/hlgLs=
github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869/go.mod h1:Ekp36dRnpXw/yCqJaO+ZrUyxD+3VXMFFr56k5XYrpB4=
github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40/go.mod h1:8rLXio+WjiTceGBHIoTvn60HIbs7Hm7bcHjyrSqYB9c=
github.com/bmizerany/perks v0.0.0-20141205001514-d9a9656a3a4b/go.mod h1:ac9efd0D1fsDb3EJvhqgXRbFx7bs2wqZ10HQPeU8U/Q=
Expand Down
239 changes: 239 additions & 0 deletions pkg/sql/sem/eval/testdata/eval/tsearch
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,242 @@ eval
phraseto_tsquery('simple', 'hello the#re')
----
'hello' <-> 'the' <-> 're'

eval
to_tsquery('english', 'qwe & sKies ')
----
'qwe' & 'sky'

eval
to_tsquery('simple', 'qwe & sKies ')
----
'qwe' & 'skies'

eval
to_tsquery('english', '''the wether'':dc & '' sKies '':BC ')
----
'wether':CD & 'sky':BC

eval
to_tsquery('english', 'asd&(and|fghj)')
----
'asd' & 'fghj'

eval
to_tsquery('english', '(asd&and)|fghj')
----
'asd' | 'fghj'

eval
to_tsquery('english', '(asd&!and)|fghj')
----
'asd' | 'fghj'

eval
to_tsquery('english', '(the|and&(i&1))&fghj')
----
'1' & 'fghj'

# Test english stemming and stopword elimination in to_tsvector
eval
to_tsvector('english', 'Here is a sentence with some stop words')
----
'sentenc':4 'stop':7 'word':8

# Test stopword elimination in to_tsquery

eval
to_tsquery('english', '!(a & !b) & c')
----
!!'b' & 'c'

eval
to_tsquery('english', '!(a & !b)')
----
!!'b'

eval
to_tsquery('english', '(1 <-> 2) <-> a')
----
'1' <-> '2'

eval
to_tsquery('english', '(1 <-> a) <-> 2')
----
'1' <2> '2'

eval
to_tsquery('english', '(a <-> 1) <-> 2')
----
'1' <-> '2'

eval
to_tsquery('english', 'a <-> (1 <-> 2)')
----
'1' <-> '2'

eval
to_tsquery('english', '1 <-> (a <-> 2)')
----
'1' <2> '2'

eval
to_tsquery('english', '1 <-> (2 <-> a)')
----
'1' <-> '2'

eval
to_tsquery('english', '(1 <-> 2) <3> a')
----
'1' <-> '2'

eval
to_tsquery('english', '(1 <-> a) <3> 2')
----
'1' <4> '2'

eval
to_tsquery('english', '(a <-> 1) <3> 2')
----
'1' <3> '2'

eval
to_tsquery('english', 'a <3> (1 <-> 2)')
----
'1' <-> '2'

eval
to_tsquery('english', '1 <3> (a <-> 2)')
----
'1' <4> '2'

eval
to_tsquery('english', '1 <3> (2 <-> a)')
----
'1' <3> '2'

eval
to_tsquery('english', '(1 <3> 2) <-> a')
----
'1' <3> '2'

eval
to_tsquery('english', '(1 <3> a) <-> 2')
----
'1' <4> '2'

eval
to_tsquery('english', '(a <3> 1) <-> 2')
----
'1' <-> '2'

eval
to_tsquery('english', 'a <-> (1 <3> 2)')
----
'1' <3> '2'

eval
to_tsquery('english', '1 <-> (a <3> 2)')
----
'1' <4> '2'

eval
to_tsquery('english', '1 <-> (2 <3> a)')
----
'1' <-> '2'

eval
to_tsquery('english', '((a <-> 1) <-> 2) <-> s')
----
'1' <-> '2'

eval
to_tsquery('english', '(2 <-> (a <-> 1)) <-> s')
----
'2' <2> '1'

eval
to_tsquery('english', '((1 <-> a) <-> 2) <-> s')
----
'1' <2> '2'

eval
to_tsquery('english', '(2 <-> (1 <-> a)) <-> s')
----
'2' <-> '1'

eval
to_tsquery('english', 's <-> ((a <-> 1) <-> 2)')
----
'1' <-> '2'

eval
to_tsquery('english', 's <-> (2 <-> (a <-> 1))')
----
'2' <2> '1'

eval
to_tsquery('english', 's <-> ((1 <-> a) <-> 2)')
----
'1' <2> '2'

eval
to_tsquery('english', 's <-> (2 <-> (1 <-> a))')
----
'2' <-> '1'

eval
to_tsquery('english', '((a <-> 1) <-> s) <-> 2')
----
'1' <2> '2'

eval
to_tsquery('english', '(s <-> (a <-> 1)) <-> 2')
----
'1' <-> '2'

eval
to_tsquery('english', '((1 <-> a) <-> s) <-> 2')
----
'1' <3> '2'

eval
to_tsquery('english', '(s <-> (1 <-> a)) <-> 2')
----
'1' <2> '2'

eval
to_tsquery('english', '2 <-> ((a <-> 1) <-> s)')
----
'2' <2> '1'

eval
to_tsquery('english', '2 <-> (s <-> (a <-> 1))')
----
'2' <3> '1'

eval
to_tsquery('english', '2 <-> ((1 <-> a) <-> s)')
----
'2' <-> '1'

eval
to_tsquery('english', '2 <-> (s <-> (1 <-> a))')
----
'2' <2> '1'


eval
to_tsquery('english', 'foo <-> (a <-> (the <-> bar))')
----
'foo' <3> 'bar'

eval
to_tsquery('english', '((foo <-> a) <-> the) <-> bar')
----
'foo' <3> 'bar'

eval
to_tsquery('english', 'foo <-> a <-> the <-> bar')
----
'foo' <3> 'bar'
34 changes: 34 additions & 0 deletions pkg/util/tsearch/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,28 @@ go_library(
"eval.go",
"lex.go",
"random.go",
"snowball.go",
"stopwords.go",
"tsquery.go",
"tsvector.go",
],
embedsrcs = [
"stopwords/danish.stop",
"stopwords/dutch.stop",
"stopwords/english.stop",
"stopwords/finnish.stop",
"stopwords/french.stop",
"stopwords/german.stop",
"stopwords/hungarian.stop",
"stopwords/italian.stop",
"stopwords/nepali.stop",
"stopwords/norwegian.stop",
"stopwords/portuguese.stop",
"stopwords/russian.stop",
"stopwords/spanish.stop",
"stopwords/swedish.stop",
"stopwords/turkish.stop",
],
importpath = "github.com/cockroachdb/cockroach/pkg/util/tsearch",
visibility = ["//visibility:public"],
deps = [
Expand All @@ -19,6 +38,21 @@ go_library(
"//pkg/sql/pgwire/pgcode",
"//pkg/sql/pgwire/pgerror",
"//pkg/util/encoding",
"@com_github_blevesearch_snowballstem//:snowballstem",
"@com_github_blevesearch_snowballstem//danish",
"@com_github_blevesearch_snowballstem//dutch",
"@com_github_blevesearch_snowballstem//english",
"@com_github_blevesearch_snowballstem//finnish",
"@com_github_blevesearch_snowballstem//french",
"@com_github_blevesearch_snowballstem//german",
"@com_github_blevesearch_snowballstem//hungarian",
"@com_github_blevesearch_snowballstem//italian",
"@com_github_blevesearch_snowballstem//norwegian",
"@com_github_blevesearch_snowballstem//portuguese",
"@com_github_blevesearch_snowballstem//russian",
"@com_github_blevesearch_snowballstem//spanish",
"@com_github_blevesearch_snowballstem//swedish",
"@com_github_blevesearch_snowballstem//turkish",
"@com_github_cockroachdb_errors//:errors",
],
)
Expand Down
Loading

0 comments on commit abdc96e

Please sign in to comment.