opensearch-project · andrross · Jan 3, 2024 · Oct 31, 2023 · Nov 1, 2023 · Nov 10, 2023
@@ -119,6 +119,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Create separate transport action for render search template action ([#11170](https://github.com/opensearch-project/OpenSearch/pull/11170))
 - Add additional handling in SearchTemplateRequest when simulate is set to true ([#11591](https://github.com/opensearch-project/OpenSearch/pull/11591))
 - Introduce cluster level setting `cluster.index.restrict.replication.type` to prevent replication type setting override during index creations ([#11583](https://github.com/opensearch-project/OpenSearch/pull/11583))
+- Add match_only_text field that is optimized for storage by trading off positional queries performance ([#11039](https://github.com/opensearch-project/OpenSearch/pull/11039))
 
 ### Dependencies
 - Bumps jetty version to 9.4.52.v20230823 to fix GMS-2023-1857 ([#9822](https://github.com/opensearch-project/OpenSearch/pull/9822))

@@ -0,0 +1,70 @@
+# integration tests for queries with specific analysis chains
+
+"match query with stacked stems":
+  - skip:
+      version: " - 2.99.99"
+      reason: "match_only_text was added in 3.0"
+  # Tests that, for the match query, stemmed tokens are "stacked" on top of
+  # the unstemmed versions in the same position.
+  - do:
+      indices.create:
+        index: test
+        body:
+          settings:
+            number_of_shards: 1
+            number_of_replicas: 1
+            analysis:
+              analyzer:
+                # Index-time analyzer: lowercases only, no stemming.
+                index:
+                  tokenizer: standard
+                  filter: [lowercase]
+                # Search-time analyzer: this filter chain is what produces the
+                # stacked stemmed/unstemmed tokens exercised by this test.
+                search:
+                  tokenizer: standard
+                  filter: [lowercase, keyword_repeat, porter_stem, unique_stem]
+              filter:
+                unique_stem:
+                  type: unique
+                  only_on_same_position: true
+          mappings:
+            properties:
+              text:
+                type: match_only_text
+                analyzer: index
+                search_analyzer: search
+
+  # Document 1 contains the literal word "runs".
+  - do:
+      index:
+        index: test
+        id:    1
+        body:  { "text": "the fox runs across the street" }
+        refresh: true
+
+  # Both query terms are required (operator AND); only document 1 exists.
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              text:
+                query: fox runs
+                operator: AND
+  - match: {hits.total: 1}
+
+  # Document 2 contains "run" but not "runs"; the index analyzer does not
+  # stem, so it is expected to match only through the stemmed search token.
+  - do:
+      index:
+        index: test
+        id:    2
+        body:  { "text": "run fox run" }
+        refresh: true
+
+  # Same query as before; both documents are now expected to match.
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              text:
+                query: fox runs
+                operator: AND
+  - match: {hits.total: 2}
@@ -0,0 +1,144 @@
+"ngram search":
+  - skip:
+      version: " - 2.99.99"
+      reason: "match_only_text was added in 3.0"
+  # Creates an index whose `text` field is analyzed into 2-character grams and
+  # checks that a query sharing a gram with the document finds it.
+  - do:
+      indices.create:
+        index: test
+        body:
+          settings:
+            number_of_shards: 1
+            number_of_replicas: 0
+            analysis:
+              analyzer:
+                my_analyzer:
+                  tokenizer: standard
+                  filter: [my_ngram]
+              filter:
+                # The gram size keys are `min_gram` / `max_gram` (as used by
+                # the ngram highlighting test); values must be plain integers.
+                my_ngram:
+                  type: ngram
+                  min_gram: 2
+                  max_gram: 2
+          mappings:
+            properties:
+              text:
+                type: match_only_text
+                analyzer: my_analyzer
+
+  - do:
+      index:
+        index: test
+        id:    1
+        body:  { "text": "foo bar baz" }
+        refresh: true
+
+  # "foa" is not a word in the document, but it shares the gram "fo" with
+  # "foo", so the single indexed document is expected to match.
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              text:
+                query: foa
+  - match: {hits.total: 1}
+
+---
+"testNGramCopyField":
+  - skip:
+      version: " - 2.99.99"
+      reason: "match_only_text was added in 3.0"
+  # Indexes a value into `origin`, which is copied to `meta`, and checks that
+  # `meta` (analyzed with an ngram tokenizer) can be searched by substrings.
+  - do:
+      indices.create:
+        index: test
+        body:
+          settings:
+            number_of_shards: 1
+            number_of_replicas: 0
+            # Matches the max_gram - min_gram spread (10 - 1) configured below.
+            max_ngram_diff: 9
+            analysis:
+              analyzer:
+                my_ngram_analyzer:
+                  tokenizer: my_ngram_tokenizer
+              tokenizer:
+                # The gram size keys are `min_gram` / `max_gram` (as used by
+                # the ngram highlighting test); values must be plain integers.
+                my_ngram_tokenizer:
+                  type: ngram
+                  min_gram: 1
+                  max_gram: 10
+                  token_chars: []
+          mappings:
+            properties:
+              origin:
+                type: match_only_text
+                copy_to: meta
+              meta:
+                type: match_only_text
+                analyzer: my_ngram_analyzer
+
+  # Only `origin` is supplied; `meta` receives its content through copy_to.
+  - do:
+      index:
+        index: test
+        id:    1
+        body:  { "origin": "C.A1234.5678" }
+        refresh: true
+
+  # Query strings that look like numbers are quoted so they are sent as text.
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              meta:
+                query: "1234"
+  - match: {hits.total: 1}
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              meta:
+                query: "1234.56"
+  - match: {hits.total: 1}
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              meta:
+                query: A1234
+  - match: {hits.total: 1}
+
+  # A term query for the lowercase `a1234` is expected to find nothing: the
+  # indexed value has an uppercase `A` and the analyzer above has no
+  # lowercase filter.
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            term:
+              meta:
+                value: a1234
+  - match: {hits.total: 0}
+
+  # Same match queries again, this time naming the analyzer explicitly.
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              meta:
+                query: A1234
+                analyzer: my_ngram_analyzer
+  - match: {hits.total: 1}
+
+  # The lowercase variant is still expected to match here (presumably via
+  # the digit grams it shares with the indexed value).
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              meta:
+                query: a1234
+                analyzer: my_ngram_analyzer
+  - match: {hits.total: 1}
@@ -0,0 +1,137 @@
+"ngram highlighting":
+  - skip:
+      version: " - 2.99.99"
+      reason: "match_only_text was added in 3.0"
+  # Two fields hold the same text but are ngram-analyzed differently at index
+  # time; the assertions below pin the highlight fragments each one returns.
+  - do:
+      indices.create:
+        index: test
+        body:
+          settings:
+            number_of_shards: 1
+            number_of_replicas: 0
+            index.max_ngram_diff: 19
+            analysis:
+              tokenizer:
+                my_ngramt:
+                  type: ngram
+                  min_gram: 1
+                  max_gram: 20
+                  token_chars: letter,digit
+              filter:
+                my_ngram:
+                  type: ngram
+                  min_gram: 1
+                  max_gram: 20
+              analyzer:
+                # `name`: grams come straight from the ngram tokenizer.
+                name_index_analyzer:
+                  tokenizer: my_ngramt
+                # `name2`: whitespace tokens are expanded by the ngram filter.
+                name2_index_analyzer:
+                  tokenizer: whitespace
+                  filter:
+                    - my_ngram
+                # Shared search-time analyzer: whitespace splitting only.
+                name_search_analyzer:
+                  tokenizer: whitespace
+          mappings:
+            properties:
+              name:
+                type: match_only_text
+                term_vector: with_positions_offsets
+                analyzer: name_index_analyzer
+                search_analyzer: name_search_analyzer
+              name2:
+                type: match_only_text
+                term_vector: with_positions_offsets
+                analyzer: name2_index_analyzer
+                search_analyzer: name_search_analyzer
+
+  - do:
+      index:
+        index: test
+        id: 1
+        body:
+          name: logicacmg ehemals avinci - the know how company
+          name2: logicacmg ehemals avinci - the know how company
+        refresh: true
+
+  # --- Field `name`: only the matching character runs are highlighted. ---
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              name:
+                query: logica m
+          highlight:
+            fields:
+              - name: {}
+  - match: { hits.total: 1 }
+  - match: { hits.hits.0.highlight.name.0: "<em>logica</em>c<em>m</em>g ehe<em>m</em>als avinci - the know how co<em>m</em>pany" }
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              name:
+                query: logica ma
+          highlight:
+            fields:
+              - name: {}
+  - match: { hits.total: 1 }
+  - match: { hits.hits.0.highlight.name.0: "<em>logica</em>cmg ehe<em>ma</em>ls avinci - the know how company" }
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              name:
+                query: logica
+          highlight:
+            fields:
+              - name: {}
+  - match: { hits.total: 1 }
+  - match: { hits.hits.0.highlight.name.0: "<em>logica</em>cmg ehemals avinci - the know how company" }
+
+  # --- Field `name2`: whole whitespace-delimited words are highlighted. ---
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              name2:
+                query: logica m
+          highlight:
+            fields:
+              - name2: {}
+  - match: { hits.total: 1 }
+  - match: { hits.hits.0.highlight.name2.0: "<em>logicacmg</em> <em>ehemals</em> avinci - the know how <em>company</em>" }
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              name2:
+                query: logica ma
+          highlight:
+            fields:
+              - name2: {}
+  - match: { hits.total: 1 }
+  - match: { hits.hits.0.highlight.name2.0: "<em>logicacmg</em> <em>ehemals</em> avinci - the know how company" }
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        body:
+          query:
+            match:
+              name2:
+                query: logica
+          highlight:
+            fields:
+              - name2: {}
+  - match: { hits.total: 1 }
+  - match: { hits.hits.0.highlight.name2.0: "<em>logicacmg</em> ehemals avinci - the know how company" }