diff --git a/build-tools/build-infra/build.gradle b/build-tools/build-infra/build.gradle index 5cb1426cba97..34d71f7509d3 100644 --- a/build-tools/build-infra/build.gradle +++ b/build-tools/build-infra/build.gradle @@ -22,6 +22,7 @@ plugins { } repositories { + mavenLocal() mavenCentral() } diff --git a/gradle/globals.gradle b/gradle/globals.gradle index bcab6461ea91..25bfddc9bebf 100644 --- a/gradle/globals.gradle +++ b/gradle/globals.gradle @@ -22,6 +22,7 @@ allprojects { // Repositories to fetch dependencies from. repositories { + mavenLocal() mavenCentral() } diff --git a/lucene/licenses/commons-LICENSE-ASL.txt b/lucene/licenses/commons-LICENSE-ASL.txt new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/lucene/licenses/commons-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/commons-NOTICE.txt b/lucene/licenses/commons-NOTICE.txt new file mode 100644 index 000000000000..554991d39bcf --- /dev/null +++ b/lucene/licenses/commons-NOTICE.txt @@ -0,0 +1,197 @@ +Apache Lucene +Copyright 2001-2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. 
This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4 +compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed +under the 2-clause BSD license. +(https://opensource.org/licenses/bsd-license.php) + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checking +Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + https://snowballstem.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. 
These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. 
The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See https://github.com/carrot2/carrot2. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski +(https://github.com/morfologik/morfologik-stemming) and uses +data from the BSD-licensed dictionary of Polish (SGJP, http://sgjp.pl/morfeusz/). + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. 
+ +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. + +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. 
+ +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. 
+ +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.1.1-20180720 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz diff --git a/lucene/licenses/commons-lang3-3.17.0.jar.sha1 b/lucene/licenses/commons-lang3-3.17.0.jar.sha1 new file mode 100644 index 000000000000..f64174593b1c --- /dev/null +++ b/lucene/licenses/commons-lang3-3.17.0.jar.sha1 @@ -0,0 +1 @@ +b17d2136f0460dcc0d2016ceefca8723bdf4ee70 diff --git a/lucene/licenses/cuvs-java-25.02.jar.sha1 b/lucene/licenses/cuvs-java-25.02.jar.sha1 new file mode 100644 index 000000000000..42b4dae43805 --- /dev/null +++ b/lucene/licenses/cuvs-java-25.02.jar.sha1 @@ -0,0 +1 @@ +870f2aed1a4633489cc9c3d33128683e668a0f30 diff --git a/lucene/licenses/cuvs-java-LICENSE-ASL.txt b/lucene/licenses/cuvs-java-LICENSE-ASL.txt new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/lucene/licenses/cuvs-java-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/cuvs-java-NOTICE.txt b/lucene/licenses/cuvs-java-NOTICE.txt new file mode 100644 index 000000000000..554991d39bcf --- /dev/null +++ b/lucene/licenses/cuvs-java-NOTICE.txt @@ -0,0 +1,197 @@ +Apache Lucene +Copyright 2001-2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. 
This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4 +compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed +under the 2-clause BSD license. +(https://opensource.org/licenses/bsd-license.php) + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checking +Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + https://snowballstem.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. 
These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. 
The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See https://github.com/carrot2/carrot2. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski +(https://github.com/morfologik/morfologik-stemming) and uses +data from the BSD-licensed dictionary of Polish (SGJP, http://sgjp.pl/morfeusz/). + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. 
+ +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. + +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. 
+ +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. 
+ +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.1.1-20180720 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz diff --git a/lucene/sandbox/build.gradle b/lucene/sandbox/build.gradle index 72762fe1c3d2..6d225fd78ba4 100644 --- a/lucene/sandbox/build.gradle +++ b/lucene/sandbox/build.gradle @@ -19,9 +19,16 @@ apply plugin: 'java-library' description = 'Various third party contributions and new ideas' +repositories { + mavenLocal() +} + + dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:queries') moduleApi project(':lucene:facet') moduleTestImplementation project(':lucene:test-framework') + moduleImplementation deps.commons.lang3 + moduleImplementation deps.cuvs } diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index f40a05af433a..051c1df0a257 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -20,6 +20,9 @@ requires org.apache.lucene.core; requires org.apache.lucene.queries; requires org.apache.lucene.facet; + requires java.logging; + requires com.nvidia.cuvs; + requires org.apache.commons.lang3; exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; @@ -34,7 +37,12 @@ exports org.apache.lucene.sandbox.facet.iterators; exports org.apache.lucene.sandbox.facet.cutters; exports org.apache.lucene.sandbox.facet.labels; + exports org.apache.lucene.sandbox.vectorsearch; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; + // provides org.apache.lucene.codecs.KnnVectorsFormat with + // 
org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat; + provides org.apache.lucene.codecs.Codec with + org.apache.lucene.sandbox.vectorsearch.CuVSCodec; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java new file mode 100644 index 000000000000..6940b9bfeea6 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CagraFieldVectorsWriter.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.concurrent.ConcurrentHashMap; +import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.index.FieldInfo; + +/** + * CuVS based fields writer + */ +public class CagraFieldVectorsWriter extends KnnFieldVectorsWriter { + + public final String fieldName; + public final ConcurrentHashMap vectors = + new ConcurrentHashMap(); + public int fieldVectorDimension = -1; + + public CagraFieldVectorsWriter(FieldInfo fieldInfo) { + this.fieldName = fieldInfo.getName(); + this.fieldVectorDimension = fieldInfo.getVectorDimension(); + } + + @Override + public long ramBytesUsed() { + return fieldName.getBytes(Charset.forName("UTF-8")).length + + Integer.BYTES + + (vectors.size() * fieldVectorDimension * Float.BYTES); + } + + @Override + public void addValue(int docID, float[] vectorValue) throws IOException { + vectors.put(docID, vectorValue); + } + + @Override + public float[] copyValue(float[] vectorValue) { + throw new UnsupportedOperationException(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java new file mode 100644 index 000000000000..1e3c85d746ef --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.LibraryNotFoundException; +import java.util.logging.Logger; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; + +/** + * CuVS based codec for GPU based vector search + */ +public class CuVSCodec extends FilterCodec { + + public CuVSCodec() { + this("CuVSCodec", new Lucene101Codec()); + } + + public CuVSCodec(String name, Codec delegate) { + super(name, delegate); + KnnVectorsFormat format; + try { + format = new CuVSVectorsFormat(1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE); + setKnnFormat(format); + } catch (LibraryNotFoundException ex) { + Logger log = Logger.getLogger(CuVSCodec.class.getName()); + log.severe("Couldn't load native library, possible classloader issue. 
" + ex.getMessage()); + } + } + + KnnVectorsFormat knnFormat = null; + + @Override + public KnnVectorsFormat knnVectorsFormat() { + return knnFormat; + } + + public void setKnnFormat(KnnVectorsFormat format) { + this.knnFormat = format; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java new file mode 100644 index 000000000000..6d2a4e281911 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.CagraIndex; +import java.util.List; +import java.util.Objects; + +/** + * This class holds references to the actual CuVS Index (Cagra, Brute force, etc.) 
+ */ +public class CuVSIndex { + private final CagraIndex cagraIndex; + private final BruteForceIndex bruteforceIndex; + private final List mapping; + private final List vectors; + private final int maxDocs; + + private final String fieldName; + private final String segmentName; + + public CuVSIndex( + String segmentName, + String fieldName, + CagraIndex cagraIndex, + List mapping, + List vectors, + int maxDocs, + BruteForceIndex bruteforceIndex) { + this.cagraIndex = Objects.requireNonNull(cagraIndex); + this.bruteforceIndex = Objects.requireNonNull(bruteforceIndex); + this.mapping = Objects.requireNonNull(mapping); + this.vectors = Objects.requireNonNull(vectors); + this.fieldName = Objects.requireNonNull(fieldName); + this.segmentName = Objects.requireNonNull(segmentName); + this.maxDocs = Objects.requireNonNull(maxDocs); + } + + public CagraIndex getCagraIndex() { + return cagraIndex; + } + + public BruteForceIndex getBruteforceIndex() { + return bruteforceIndex; + } + + public List getMapping() { + return mapping; + } + + public String getFieldName() { + return fieldName; + } + + public List getVectors() { + return vectors; + } + + public String getSegmentName() { + return segmentName; + } + + public int getMaxDocs() { + return maxDocs; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java new file mode 100644 index 000000000000..e4ce49fb84f7 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.knn.KnnCollectorManager; +import org.apache.lucene.util.Bits; + +/** + * Query for CuVS + */ +public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { + + private final int iTopK; + private final int searchWidth; + + public CuVSKnnFloatVectorQuery(String field, float[] target, int k, int iTopK, int searchWidth) { + super(field, target, k); + this.iTopK = iTopK; + this.searchWidth = searchWidth; + } + + @Override + protected TopDocs approximateSearch( + LeafReaderContext context, + Bits acceptDocs, + int visitedLimit, + KnnCollectorManager knnCollectorManager) + throws IOException { + + PerLeafCuVSKnnCollector results = new PerLeafCuVSKnnCollector(k, iTopK, searchWidth); + + LeafReader reader = context.reader(); + reader.searchNearestVectors(field, this.getTargetCopy(), results, null); + return results.topDocs(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java new file mode 100644 index 000000000000..e6be4726f16e --- /dev/null +++ 
/**
 * Methods to deal with a CuVS composite file inside a segment.
 *
 * <p>The composite file is written as a zip stream with the compression level set to {@link
 * Deflater#NO_COMPRESSION}; every added file becomes one zip entry. The names of all entries
 * written so far are tracked and can be queried through {@link #getFilesAdded()}.
 */
public class CuVSSegmentFile implements AutoCloseable {
  private final ZipOutputStream zos;

  /** Names of the entries successfully written so far. */
  private final Set<String> filesAdded = new HashSet<>();

  protected Logger log = Logger.getLogger(getClass().getName());

  /**
   * Starts a composite file on top of the given stream.
   *
   * @param out destination of the zip data; closed when this object is closed
   */
  public CuVSSegmentFile(OutputStream out) {
    zos = new ZipOutputStream(out);
    zos.setLevel(Deflater.NO_COMPRESSION);
  }

  /**
   * Writes one entry to the composite file.
   *
   * @param name entry name
   * @param bytes entry content, written in full
   * @throws NullPointerException if {@code name} or {@code bytes} is null; nothing is written
   * @throws IOException if the underlying zip stream rejects the entry or fails to write
   */
  public void addFile(String name, byte[] bytes) throws IOException {
    // Validate before touching the stream so a bad argument cannot leave a half-written entry.
    java.util.Objects.requireNonNull(name, "name");
    java.util.Objects.requireNonNull(bytes, "bytes");

    ZipEntry indexFileZipEntry = new ZipEntry(name);
    zos.putNextEntry(indexFileZipEntry);
    zos.write(bytes, 0, bytes.length);
    zos.closeEntry();
    filesAdded.add(name);
  }

  /**
   * Returns the names of the entries written so far.
   *
   * @return an unmodifiable view of the tracked names
   */
  public Set<String> getFilesAdded() {
    return Collections.unmodifiableSet(filesAdded);
  }

  /** Finishes the zip stream and closes the underlying output stream. */
  @Override
  public void close() throws IOException {
    zos.close();
  }
}
+ */
+package org.apache.lucene.sandbox.vectorsearch;
+
+import com.nvidia.cuvs.CuVSResources;
+import com.nvidia.cuvs.LibraryNotFoundException;
+import java.io.IOException;
+import org.apache.lucene.codecs.KnnVectorsFormat;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy;
+
+/**
+ * CuVS based KnnVectorsFormat for GPU acceleration.
+ *
+ * <p>All instances share the single static {@link #resources} handle. It is created by the first
+ * format that is constructed and reused by every later one.
+ */
+public class CuVSVectorsFormat extends KnnVectorsFormat {
+
+  public static final String VECTOR_DATA_CODEC_NAME = "Lucene99CagraVectorsFormatData";
+  public static final String VECTOR_DATA_EXTENSION = "cag";
+  public static final String META_EXTENSION = "cagmf";
+  public static final int VERSION_CURRENT = 0;
+  public final int maxDimensions = 4096;
+  public final int cuvsWriterThreads;
+  public final int intGraphDegree;
+  public final int graphDegree;
+  public MergeStrategy mergeStrategy;
+
+  /** Shared cuVS resources handle, handed to every reader and writer this format creates. */
+  public static CuVSResources resources;
+
+  /**
+   * Creates a format with the default settings: one writer thread, intermediate graph degree 128,
+   * graph degree 64 and {@link MergeStrategy#NON_TRIVIAL_MERGE}.
+   *
+   * @throws RuntimeException wrapping any failure to create the cuVS resources
+   */
+  public CuVSVectorsFormat() {
+    super("CuVSVectorsFormat");
+    this.cuvsWriterThreads = 1;
+    this.intGraphDegree = 128;
+    this.graphDegree = 64;
+    // A null strategy makes CuVSVectorsWriter#mergeOneField fail with a NullPointerException, and
+    // TRIVIAL_MERGE is rejected there, so NON_TRIVIAL_MERGE is the only usable default.
+    this.mergeStrategy = MergeStrategy.NON_TRIVIAL_MERGE;
+    try {
+      ensureResources();
+    } catch (Throwable e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Creates a format with explicit index-build settings.
+   *
+   * @param cuvsWriterThreads number of writer threads passed to the CAGRA index builder
+   * @param intGraphDegree intermediate graph degree passed to the CAGRA index builder
+   * @param graphDegree graph degree passed to the CAGRA index builder
+   * @param mergeStrategy strategy the writer applies when merging segments
+   * @throws LibraryNotFoundException if creating the cuVS resources reports a missing library
+   * @throws RuntimeException wrapping any other failure to create the cuVS resources
+   */
+  public CuVSVectorsFormat(
+      int cuvsWriterThreads, int intGraphDegree, int graphDegree, MergeStrategy mergeStrategy)
+      throws LibraryNotFoundException {
+    super("CuVSVectorsFormat");
+    this.mergeStrategy = mergeStrategy;
+    this.cuvsWriterThreads = cuvsWriterThreads;
+    this.intGraphDegree = intGraphDegree;
+    this.graphDegree = graphDegree;
+    try {
+      ensureResources();
+    } catch (LibraryNotFoundException ex) {
+      throw ex;
+    } catch (Throwable e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Creates the shared {@link #resources} handle if nobody has done so yet.
+   *
+   * <p>Previously every constructor call replaced the static handle, which dropped the earlier
+   * handle without releasing it while readers and writers created before were still using it.
+   */
+  private static synchronized void ensureResources() throws Throwable {
+    if (resources == null) {
+      resources = new CuVSResources();
+    }
+  }
+
+  @Override
+  public CuVSVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
+    return new CuVSVectorsWriter(
+        state, cuvsWriterThreads, intGraphDegree, graphDegree, mergeStrategy, resources);
+  }
+
+  /**
+   * Opens a reader for the segment described by {@code state}.
+   *
+   * @throws IOException if the segment file cannot be opened or read; I/O failures keep their
+   *     original type instead of being hidden inside a {@link RuntimeException}
+   */
+  @Override
+  public CuVSVectorsReader fieldsReader(SegmentReadState state) throws IOException {
+    try {
+      return new CuVSVectorsReader(state, resources);
+    } catch (IOException | RuntimeException | Error e) {
+      throw e;
+    } catch (Throwable e) {
+      // The reader constructor declares Throwable; only the remaining checked types are wrapped.
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public int getMaxDimensions(String fieldName) {
+    return maxDimensions;
+  }
+}
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java
new file mode 100644
index 000000000000..b41e5c08f177
--- /dev/null
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java
@@ -0,0 +1,343 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.sandbox.vectorsearch;
+
+import com.nvidia.cuvs.BruteForceIndex;
+import com.nvidia.cuvs.BruteForceQuery;
+import com.nvidia.cuvs.CagraIndex;
+import com.nvidia.cuvs.CagraQuery;
+import com.nvidia.cuvs.CagraSearchParams;
+import com.nvidia.cuvs.CuVSResources;
+import com.nvidia.cuvs.HnswIndex;
+import com.nvidia.cuvs.HnswIndexParams;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.lang.StackWalker.StackFrame;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+import org.apache.commons.lang3.SerializationUtils;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.KnnVectorsReader;
+import org.apache.lucene.index.ByteVectorValues;
+import org.apache.lucene.index.FloatVectorValues;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.search.KnnCollector;
+import org.apache.lucene.search.TopKnnCollector;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * KnnVectorsReader instance associated with CuVS format.
+ *
+ * <p>After the codec header, the segment data file holds a zip archive. Its entries are named
+ * {@code <segment>/<field>.<ext>} with the extensions {@code cag}, {@code bf}, {@code hnsw},
+ * {@code vec} and {@code map}, plus one {@code <segment>.meta} entry that maps segment names to
+ * their maxDoc. All entries are loaded eagerly by the constructor.
+ */
+public class CuVSVectorsReader extends KnnVectorsReader {
+
+  IndexInput vectorDataReader = null;
+  public String fileName = null;
+  public byte[] indexFileBytes;
+  public int[] docIds;
+  public float[] vectors;
+  public SegmentReadState segmentState = null;
+  public int indexFilePayloadSize = 0;
+  public long initialFilePointerLoc = 0;
+  public SegmentInputStream segmentInputStream;
+
+  // Field to List of Indexes
+  public Map<String, List<CuVSIndex>> cuvsIndexes;
+
+  private CuVSResources resources;
+
+  /**
+   * Opens the segment's cuVS data file and loads every index stored in it.
+   *
+   * @param state segment to read
+   * @param resources cuVS resources handle used to rebuild the native indexes
+   * @throws Throwable if the file cannot be read or an index cannot be rebuilt; the opened file
+   *     is closed again before the failure propagates
+   */
+  public CuVSVectorsReader(SegmentReadState state, CuVSResources resources) throws Throwable {
+
+    segmentState = state;
+    this.resources = resources;
+
+    fileName =
+        IndexFileNames.segmentFileName(
+            state.segmentInfo.name, state.segmentSuffix, CuVSVectorsFormat.VECTOR_DATA_EXTENSION);
+
+    vectorDataReader = segmentState.directory.openInput(fileName, segmentState.context);
+    boolean success = false;
+    try {
+      CodecUtil.readIndexHeader(vectorDataReader);
+
+      initialFilePointerLoc = vectorDataReader.getFilePointer();
+      // Subtract as longs: narrowing length and position to int separately wraps around for
+      // files larger than 2 GiB. The public int field can only report a saturated value.
+      long payloadSize = vectorDataReader.length() - initialFilePointerLoc;
+      indexFilePayloadSize = (int) Math.min(payloadSize, Integer.MAX_VALUE);
+      segmentInputStream =
+          new SegmentInputStream(vectorDataReader, payloadSize, initialFilePointerLoc);
+
+      this.cuvsIndexes = loadCuVSIndex(getIndexInputStream());
+      success = true;
+    } finally {
+      if (success == false) {
+        // Do not leak the open file handle when loading fails.
+        IOUtils.closeWhileHandlingException(vectorDataReader);
+      }
+    }
+  }
+
+  /**
+   * Reads all zip entries and groups the rebuilt indexes by field name.
+   *
+   * <p>NOTE(review): the {@code vec}, {@code map} and {@code meta} entries are restored with Java
+   * native deserialization. That is only safe while index files come from a trusted source.
+   *
+   * @param zipStream zip view of the segment payload; closed before this method returns
+   * @return map from field name to the indexes stored for that field
+   * @throws IOException if an entry name is malformed or the maxDoc metadata is missing
+   */
+  @SuppressWarnings({"unchecked"})
+  private Map<String, List<CuVSIndex>> loadCuVSIndex(ZipInputStream zipStream) throws Throwable {
+    Map<String, CagraIndex> cagraIndexes = new HashMap<>();
+    Map<String, BruteForceIndex> bruteforceIndexes = new HashMap<>();
+    // Loaded for parity with the writer; nothing in this class reads it yet.
+    Map<String, HnswIndex> hnswIndexes = new HashMap<>();
+    Map<String, List<Integer>> mappings = new HashMap<>();
+    Map<String, List<float[]>> vectors = new HashMap<>();
+    Map<String, Integer> maxDocs = null; // map of segment, maxDocs
+
+    // Closing the zip stream releases its inflater; SegmentInputStream#close leaves the
+    // underlying IndexInput open, so the reader stays usable.
+    try (ZipInputStream zis = zipStream) {
+      ZipEntry ze;
+      while ((ze = zis.getNextEntry()) != null) {
+        String entry = ze.getName();
+
+        // Split on the LAST dot so that field names containing dots keep their full name.
+        int dot = entry.lastIndexOf('.');
+        if (dot < 0) {
+          continue;
+        }
+        String segmentField = entry.substring(0, dot);
+        String extension = entry.substring(dot + 1);
+        byte[] bytes = readEntry(zis);
+
+        switch (extension) {
+          case "meta":
+            {
+              maxDocs = (Map<String, Integer>) SerializationUtils.deserialize(bytes);
+              break;
+            }
+          case "vec":
+            {
+              vectors.put(segmentField, (List<float[]>) SerializationUtils.deserialize(bytes));
+              break;
+            }
+          case "map":
+            {
+              mappings.put(segmentField, (List<Integer>) SerializationUtils.deserialize(bytes));
+              break;
+            }
+          case "cag":
+            {
+              cagraIndexes.put(
+                  segmentField,
+                  new CagraIndex.Builder(resources).from(new ByteArrayInputStream(bytes)).build());
+              break;
+            }
+          case "bf":
+            {
+              bruteforceIndexes.put(
+                  segmentField,
+                  new BruteForceIndex.Builder(resources)
+                      .from(new ByteArrayInputStream(bytes))
+                      .build());
+              break;
+            }
+          case "hnsw":
+            {
+              HnswIndexParams indexParams = new HnswIndexParams.Builder(resources).build();
+              hnswIndexes.put(
+                  segmentField,
+                  new HnswIndex.Builder(resources)
+                      .from(new ByteArrayInputStream(bytes))
+                      .withIndexParams(indexParams)
+                      .build());
+              break;
+            }
+          default:
+            break;
+        }
+      }
+    }
+
+    Map<String, List<CuVSIndex>> ret = new HashMap<>();
+    for (Entry<String, CagraIndex> cagraEntry : cagraIndexes.entrySet()) {
+      String segmentField = cagraEntry.getKey();
+      int slash = segmentField.indexOf('/');
+      if (slash < 0) {
+        throw new IOException("Malformed index entry '" + segmentField + "' in " + fileName);
+      }
+      String segment = segmentField.substring(0, slash);
+      String field = segmentField.substring(slash + 1);
+
+      // Fail with a clear message instead of a NullPointerException while unboxing.
+      Integer maxDoc = maxDocs == null ? null : maxDocs.get(segment);
+      if (maxDoc == null) {
+        throw new IOException("Missing maxDoc metadata for segment " + segment + " in " + fileName);
+      }
+
+      CuVSIndex cuvsIndex =
+          new CuVSIndex(
+              segment,
+              field,
+              cagraEntry.getValue(),
+              mappings.get(segmentField),
+              vectors.get(segmentField),
+              maxDoc,
+              bruteforceIndexes.get(segmentField));
+      ret.computeIfAbsent(field, unused -> new ArrayList<>()).add(cuvsIndex);
+    }
+    return ret;
+  }
+
+  /** Reads the remaining bytes of the zip entry the stream is currently positioned on. */
+  private static byte[] readEntry(ZipInputStream zis) throws IOException {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    byte[] buffer = new byte[8192];
+    int len;
+    while ((len = zis.read(buffer)) != -1) {
+      baos.write(buffer, 0, len);
+    }
+    return baos.toByteArray();
+  }
+
+  /** Collects the given stack frames into a list. */
+  public List<StackFrame> getStackTrace(Stream<StackFrame> stackFrameStream) {
+    return stackFrameStream.collect(Collectors.toList());
+  }
+
+  /** Rewinds the segment payload and returns a fresh zip view over it. */
+  public ZipInputStream getIndexInputStream() throws IOException {
+    segmentInputStream.reset();
+    return new ZipInputStream(segmentInputStream);
+  }
+
+  @Override
+  public void close() throws IOException {
+    IOUtils.close(vectorDataReader);
+  }
+
+  /** Verifies the checksum stored in the codec footer of the segment data file. */
+  @Override
+  public void checkIntegrity() throws IOException {
+    CodecUtil.checksumEntireFile(vectorDataReader);
+  }
+
+  /**
+   * Returns a view over the vectors stored for {@code field}.
+   *
+   * <p>Only the first index registered for the field is consulted. The lookup happens lazily, so
+   * an unknown field fails when the returned object is first used.
+   */
+  @Override
+  public FloatVectorValues getFloatVectorValues(String field) throws IOException {
+    return new FloatVectorValues() {
+
+      @Override
+      public int size() {
+        return cuvsIndexes.get(field).get(0).getVectors().size();
+      }
+
+      @Override
+      public int dimension() {
+        return cuvsIndexes.get(field).get(0).getVectors().get(0).length;
+      }
+
+      @Override
+      public float[] vectorValue(int pos) throws IOException {
+        return cuvsIndexes.get(field).get(0).getVectors().get(pos);
+      }
+
+      @Override
+      public FloatVectorValues copy() throws IOException {
+        // This view keeps no iteration state, so it can stand in for its own copy. Returning
+        // null, as before, broke every caller that used the copy.
+        return this;
+      }
+    };
+  }
+
+  @Override
+  public ByteVectorValues getByteVectorValues(String field) throws IOException {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Searches every index stored for {@code field} and feeds the hits to {@code knnCollector}.
+   *
+   * <p>CAGRA is used for {@code k <= 1024}, brute force otherwise. Hits are always delivered to
+   * the collector passed in. Previously a collector that was neither a {@link
+   * PerLeafCuVSKnnCollector} nor a {@link TopKnnCollector} silently received nothing.
+   *
+   * <p>NOTE(review): the CAGRA path does not apply {@code acceptDocs}, and the value handed to
+   * the collector is whatever cuVS reports for the hit (presumably a distance) - confirm both.
+   *
+   * @param field vector field to search; a field without an index yields no hits
+   * @param target query vector
+   * @param knnCollector receives {@code (docId, value)} pairs
+   * @param acceptDocs documents allowed to match, or {@code null} to allow all
+   */
+  @Override
+  public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
+      throws IOException {
+    List<CuVSIndex> indexes = cuvsIndexes.get(field);
+    if (indexes == null) {
+      return;
+    }
+
+    // When the caller passed another collector type, this instance only supplies the search
+    // parameters (k, iTopK, searchWidth).
+    PerLeafCuVSKnnCollector cuvsCollector =
+        knnCollector instanceof PerLeafCuVSKnnCollector
+            ? ((PerLeafCuVSKnnCollector) knnCollector)
+            : new PerLeafCuVSKnnCollector(knnCollector.k(), knnCollector.k(), 1);
+
+    int prevDocCount = 0;
+
+    for (CuVSIndex cuvsIndex : indexes) {
+      try {
+        Map<Integer, Float> result;
+        if (cuvsCollector.k() <= 1024) {
+          CagraSearchParams searchParams =
+              new CagraSearchParams.Builder(resources)
+                  .withItopkSize(cuvsCollector.iTopK)
+                  .withSearchWidth(cuvsCollector.searchWidth)
+                  .build();
+
+          CagraQuery query =
+              new CagraQuery.Builder()
+                  .withTopK(cuvsCollector.k())
+                  .withSearchParams(searchParams)
+                  .withMapping(cuvsIndex.getMapping())
+                  .withQueryVectors(new float[][] {target})
+                  .build();
+
+          CagraIndex cagraIndex = cuvsIndex.getCagraIndex();
+          assert (cagraIndex != null);
+          // The result list has a single entry because a single query vector is submitted.
+          result = cagraIndex.search(query).getResults().get(0);
+        } else {
+          BruteForceQuery bruteforceQuery =
+              new BruteForceQuery.Builder()
+                  .withQueryVectors(new float[][] {target})
+                  .withPrefilter(toPrefilter(acceptDocs, cuvsIndex.getMaxDocs()))
+                  .withTopK(cuvsCollector.k())
+                  .build();
+
+          BruteForceIndex bruteforceIndex = cuvsIndex.getBruteforceIndex();
+          result = bruteforceIndex.search(bruteforceQuery).getResults().get(0);
+        }
+
+        for (Entry<Integer, Float> kv : result.entrySet()) {
+          knnCollector.collect(prevDocCount + kv.getKey(), kv.getValue());
+        }
+
+      } catch (Throwable e) {
+        throw new RuntimeException(e);
+      }
+      prevDocCount += cuvsIndex.getMaxDocs();
+    }
+  }
+
+  /**
+   * Converts {@code acceptDocs} into the {@code long[]} bit layout of a {@link FixedBitSet}.
+   *
+   * <p>The previous unconditional cast failed with a NullPointerException for {@code null}
+   * (meaning "accept everything") and with a ClassCastException for any other {@link Bits}
+   * implementation.
+   *
+   * @param acceptDocs accepted documents, may be {@code null}
+   * @param maxDoc number of bits to mark as accepted when {@code acceptDocs} is {@code null}
+   */
+  private static long[] toPrefilter(Bits acceptDocs, int maxDoc) {
+    if (acceptDocs instanceof FixedBitSet) {
+      return ((FixedBitSet) acceptDocs).getBits();
+    }
+    int numBits = acceptDocs == null ? maxDoc : acceptDocs.length();
+    FixedBitSet bits = new FixedBitSet(numBits);
+    if (acceptDocs == null) {
+      bits.set(0, numBits);
+    } else {
+      for (int i = 0; i < numBits; i++) {
+        if (acceptDocs.get(i)) {
+          bits.set(i);
+        }
+      }
+    }
+    return bits.getBits();
+  }
+
+  @Override
+  public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs)
+      throws IOException {
+    throw new UnsupportedOperationException();
+  }
+}
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java
new file mode 100644
index 000000000000..bb40b7119a0e
--- /dev/null
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java
@@ -0,0 +1,411 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.sandbox.vectorsearch;
+
+import com.nvidia.cuvs.BruteForceIndex;
+import com.nvidia.cuvs.BruteForceIndexParams;
+import com.nvidia.cuvs.CagraIndex;
+import com.nvidia.cuvs.CagraIndexParams;
+import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo;
+import com.nvidia.cuvs.CuVSResources;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import org.apache.commons.lang3.SerializationUtils;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.KnnFieldVectorsWriter;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Sorter.DocMap;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.SuppressForbidden;
+
+/**
+ * KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU.
+ *
+ * <p>Each segment gets one data file: a codec header, a zip archive with the entries {@code
+ * <segment>/<field>.{cag,bf,hnsw,vec,map}} and {@code <segment>.meta}, and a codec footer.
+ */
+public class CuVSVectorsWriter extends KnnVectorsWriter {
+
+  private List<CagraFieldVectorsWriter> fieldVectorWriters = new ArrayList<>();
+  private IndexOutput cuVSIndex = null;
+  private SegmentWriteState segmentWriteState = null;
+  private String cuVSDataFilename = null;
+
+  private CagraIndex cagraIndex;
+  private CagraIndex cagraIndexForHnsw;
+
+  private int cuvsWriterThreads;
+  private int intGraphDegree;
+  private int graphDegree;
+  private MergeStrategy mergeStrategy;
+  private CuVSResources resources;
+
+  /**
+   * Merge strategy used for CuVS
+   */
+  public enum MergeStrategy {
+    TRIVIAL_MERGE,
+    NON_TRIVIAL_MERGE
+  };
+
+  /**
+   * Creates a writer for one segment.
+   *
+   * @param state segment being written
+   * @param cuvsWriterThreads number of writer threads passed to the CAGRA index builder
+   * @param intGraphDegree intermediate graph degree passed to the CAGRA index builder
+   * @param graphDegree graph degree passed to the CAGRA index builder
+   * @param mergeStrategy merge strategy; {@code null} selects {@link
+   *     MergeStrategy#NON_TRIVIAL_MERGE}, the only strategy {@link #mergeOneField} implements
+   * @param resources cuVS resources handle used to build indexes
+   */
+  public CuVSVectorsWriter(
+      SegmentWriteState state,
+      int cuvsWriterThreads,
+      int intGraphDegree,
+      int graphDegree,
+      MergeStrategy mergeStrategy,
+      CuVSResources resources)
+      throws IOException {
+    super();
+    this.segmentWriteState = state;
+    // A null strategy used to surface as a NullPointerException in the middle of a merge.
+    this.mergeStrategy = mergeStrategy == null ? MergeStrategy.NON_TRIVIAL_MERGE : mergeStrategy;
+    this.cuvsWriterThreads = cuvsWriterThreads;
+    this.intGraphDegree = intGraphDegree;
+    this.graphDegree = graphDegree;
+    this.resources = resources;
+
+    cuVSDataFilename =
+        IndexFileNames.segmentFileName(
+            this.segmentWriteState.segmentInfo.name,
+            this.segmentWriteState.segmentSuffix,
+            CuVSVectorsFormat.VECTOR_DATA_EXTENSION);
+  }
+
+  @Override
+  public long ramBytesUsed() {
+    // NOTE(review): buffered vectors are not accounted for; always reports 0.
+    return 0;
+  }
+
+  /**
+   * Closes the flush output and any merge output that {@link #finish()} did not get to close.
+   * Calling it more than once is harmless.
+   */
+  @Override
+  public void close() throws IOException {
+    IndexOutput flushOutput = cuVSIndex;
+    IndexOutput pendingMergeOutput = mergeOutputStream == null ? null : mergeOutputStream.out;
+    cuVSIndex = null;
+    mergeOutputStream = null;
+    mergedIndexFile = null;
+    if (fieldVectorWriters != null) {
+      fieldVectorWriters.clear();
+      fieldVectorWriters = null;
+    }
+    IOUtils.close(flushOutput, pendingMergeOutput);
+  }
+
+  @Override
+  public KnnFieldVectorsWriter<?> addField(FieldInfo fieldInfo) throws IOException {
+    CagraFieldVectorsWriter cagraFieldVectorWriter = new CagraFieldVectorsWriter(fieldInfo);
+    fieldVectorWriters.add(cagraFieldVectorWriter);
+    return cagraFieldVectorWriter;
+  }
+
+  /** Builds the CAGRA parameters shared by the CAGRA and the HNSW index builds. */
+  private CagraIndexParams newCagraIndexParams() {
+    return new CagraIndexParams.Builder(resources)
+        .withNumWriterThreads(cuvsWriterThreads)
+        .withIntermediateGraphDegree(intGraphDegree)
+        .withGraphDegree(graphDegree)
+        .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT)
+        .build();
+  }
+
+  /**
+   * Builds a CAGRA index over {@code vectors} and returns its serialized form.
+   *
+   * @param vectors one row per vector
+   * @param mapping currently unused by this method
+   */
+  @SuppressForbidden(reason = "A temporary java.util.File is needed for Cagra's serialization")
+  private byte[] createCagraIndex(float[][] vectors, List<Integer> mapping) throws Throwable {
+    cagraIndex =
+        new CagraIndex.Builder(resources)
+            .withDataset(vectors)
+            .withIndexParams(newCagraIndexParams())
+            .build();
+
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    // createTempFile already picks a unique random name.
+    File tmpFile = File.createTempFile("tmpindex", "cag");
+    try {
+      cagraIndex.serialize(baos, tmpFile);
+    } finally {
+      // Best effort: without this every flush and merge left a file in the temp directory.
+      tmpFile.delete();
+    }
+    return baos.toByteArray();
+  }
+
+  /** Builds a brute force index over {@code vectors} and returns its serialized form. */
+  @SuppressForbidden(reason = "A temporary java.util.File is needed for BruteForce's serialization")
+  private byte[] createBruteForceIndex(float[][] vectors) throws Throwable {
+    BruteForceIndexParams indexParams =
+        new BruteForceIndexParams.Builder()
+            .withNumWriterThreads(32) // TODO: Make this configurable later.
+            .build();
+
+    BruteForceIndex index =
+        new BruteForceIndex.Builder(resources)
+            .withIndexParams(indexParams)
+            .withDataset(vectors)
+            .build();
+
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    index.serialize(baos);
+    return baos.toByteArray();
+  }
+
+  /** Builds a CAGRA index over {@code vectors} and returns it serialized in HNSW form. */
+  @SuppressForbidden(reason = "A temporary java.util.File is needed for HNSW's serialization")
+  private byte[] createHnswIndex(float[][] vectors) throws Throwable {
+    cagraIndexForHnsw =
+        new CagraIndex.Builder(resources)
+            .withDataset(vectors)
+            .withIndexParams(newCagraIndexParams())
+            .build();
+
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    File tmpFile = File.createTempFile("tmpindex", "hnsw");
+    try {
+      cagraIndexForHnsw.serializeToHNSW(baos, tmpFile);
+    } finally {
+      tmpFile.delete(); // best effort, see createCagraIndex
+    }
+    return baos.toByteArray();
+  }
+
+  /**
+   * Writes the indexes of all buffered fields into the segment data file.
+   *
+   * <p>Rows of the built indexes, the {@code .vec} list and the {@code .map} list all follow
+   * ascending doc id order. Previously row {@code i} was read with {@code vectors.get(i)}, which
+   * fails as soon as a document lacks the field, and could disagree with the order of the {@code
+   * .map} entry. Fields without any vector are skipped.
+   *
+   * <p>NOTE(review): {@code sortMap} is not applied.
+   */
+  @SuppressWarnings({"resource", "rawtypes", "unchecked"})
+  @Override
+  public void flush(int maxDoc, DocMap sortMap) throws IOException {
+    cuVSIndex =
+        this.segmentWriteState.directory.createOutput(
+            cuVSDataFilename, this.segmentWriteState.context);
+    CodecUtil.writeIndexHeader(
+        cuVSIndex,
+        CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME,
+        CuVSVectorsFormat.VERSION_CURRENT,
+        this.segmentWriteState.segmentInfo.getId(),
+        this.segmentWriteState.segmentSuffix);
+
+    CuVSSegmentFile cuVSFile = new CuVSSegmentFile(new SegmentOutputStream(cuVSIndex, 100000));
+    String segmentName = segmentWriteState.segmentInfo.name;
+
+    for (CagraFieldVectorsWriter field : fieldVectorWriters) {
+      if (field.vectors.isEmpty()) {
+        continue;
+      }
+
+      // ArrayList (not List) because SerializationUtils.serialize needs a Serializable.
+      ArrayList<Integer> docIds = new ArrayList<>(field.vectors.keySet());
+      docIds.sort(null);
+      ArrayList<float[]> orderedVectors = new ArrayList<>(docIds.size());
+      for (Integer docId : docIds) {
+        orderedVectors.add(field.vectors.get(docId));
+      }
+      float[][] vectors = orderedVectors.toArray(new float[0][]);
+
+      byte[] cagraIndexBytes;
+      byte[] bruteForceIndexBytes;
+      byte[] hnswIndexBytes;
+      try {
+        cagraIndexBytes = createCagraIndex(vectors, docIds);
+        bruteForceIndexBytes = createBruteForceIndex(vectors);
+        hnswIndexBytes = createHnswIndex(vectors);
+      } catch (Throwable e) {
+        throw new RuntimeException(e);
+      }
+
+      String entryPrefix = segmentName + "/" + field.fieldName;
+      cuVSFile.addFile(entryPrefix + ".cag", cagraIndexBytes);
+      cuVSFile.addFile(entryPrefix + ".bf", bruteForceIndexBytes);
+      cuVSFile.addFile(entryPrefix + ".hnsw", hnswIndexBytes);
+      cuVSFile.addFile(entryPrefix + ".vec", SerializationUtils.serialize(orderedVectors));
+      cuVSFile.addFile(entryPrefix + ".map", SerializationUtils.serialize(docIds));
+      field.vectors.clear();
+    }
+
+    LinkedHashMap<String, Integer> metaMap = new LinkedHashMap<>();
+    metaMap.put(segmentName, maxDoc);
+    cuVSFile.addFile(segmentName + ".meta", SerializationUtils.serialize(metaMap));
+    cuVSFile.close();
+
+    CodecUtil.writeFooter(cuVSIndex);
+  }
+
+  SegmentOutputStream mergeOutputStream = null;
+  CuVSSegmentFile mergedIndexFile = null;
+
+  /**
+   * Rebuilds the indexes of {@code fieldInfo} from the vectors of all segments being merged.
+   *
+   * <p>The merged data file is created by the first call and completed by {@link #finish()}.
+   *
+   * <p>NOTE(review): the {@code .map} entry lists the mapped id of every document of every
+   * source segment, while the vector list only holds documents that have a vector; the two can
+   * differ in length - confirm the intended layout.
+   *
+   * @throws UnsupportedOperationException for {@link MergeStrategy#TRIVIAL_MERGE}
+   */
+  @SuppressWarnings("resource")
+  @Override
+  public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
+    if (mergeStrategy == MergeStrategy.TRIVIAL_MERGE) {
+      // Reject before any output is created so that no half-written file stays open.
+      throw new UnsupportedOperationException();
+    }
+
+    List<SegmentInputStream> segInputStreams = new ArrayList<>();
+    for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
+      CuVSVectorsReader reader = (CuVSVectorsReader) mergeState.knnVectorsReaders[i];
+      if (reader != null) { // the slot is null for a segment without a vectors reader
+        segInputStreams.add(reader.segmentInputStream);
+      }
+    }
+
+    if (mergedIndexFile == null) {
+      // First field of this merge. Checking our own state avoids listing the directory per field.
+      IndexOutput mergedVectorIndex =
+          segmentWriteState.directory.createOutput(cuVSDataFilename, segmentWriteState.context);
+      CodecUtil.writeIndexHeader(
+          mergedVectorIndex,
+          CuVSVectorsFormat.VECTOR_DATA_CODEC_NAME,
+          CuVSVectorsFormat.VERSION_CURRENT,
+          segmentWriteState.segmentInfo.getId(),
+          segmentWriteState.segmentSuffix);
+      this.mergeOutputStream = new SegmentOutputStream(mergedVectorIndex, 100000);
+      mergedIndexFile = new CuVSSegmentFile(this.mergeOutputStream);
+    }
+
+    ArrayList<Integer> docMapList = new ArrayList<>();
+    for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
+      for (int id = 0; id < mergeState.maxDocs[i]; id++) {
+        docMapList.add(mergeState.docMaps[i].get(id));
+      }
+    }
+
+    String segmentName = segmentWriteState.segmentInfo.name;
+    ArrayList<float[]> mergedVectors =
+        Util.getMergedVectors(segInputStreams, fieldInfo.name, segmentName);
+    if (mergedVectors.isEmpty()) {
+      // Nothing to index; previously this failed on mergedVectors.get(0).
+      return;
+    }
+
+    byte[] cagraIndexBytes;
+    byte[] bruteForceIndexBytes;
+    byte[] hnswIndexBytes;
+    try {
+      float[][] vectors = mergedVectors.toArray(new float[0][]);
+      cagraIndexBytes = createCagraIndex(vectors, new ArrayList<>());
+      bruteForceIndexBytes = createBruteForceIndex(vectors);
+      hnswIndexBytes = createHnswIndex(vectors);
+    } catch (Throwable e) {
+      throw new RuntimeException(e);
+    }
+
+    String entryPrefix = segmentName + "/" + fieldInfo.getName();
+    mergedIndexFile.addFile(entryPrefix + ".cag", cagraIndexBytes);
+    mergedIndexFile.addFile(entryPrefix + ".bf", bruteForceIndexBytes);
+    mergedIndexFile.addFile(entryPrefix + ".hnsw", hnswIndexBytes);
+    mergedIndexFile.addFile(entryPrefix + ".vec", SerializationUtils.serialize(mergedVectors));
+    mergedIndexFile.addFile(entryPrefix + ".map", SerializationUtils.serialize(docMapList));
+
+    String metaEntry = segmentName + ".meta";
+    if (mergedIndexFile.getFilesAdded().contains(metaEntry) == false) {
+      LinkedHashMap<String, Integer> metaMap = new LinkedHashMap<>();
+      metaMap.put(segmentName, mergedVectors.size());
+      mergedIndexFile.addFile(metaEntry, SerializationUtils.serialize(metaMap));
+    }
+  }
+
+  /** Completes the merged data file, if a merge wrote one, and closes its output. */
+  @Override
+  public void finish() throws IOException {
+    if (this.mergeOutputStream == null) {
+      return;
+    }
+    IndexOutput out = mergeOutputStream.out;
+    boolean success = false;
+    try {
+      mergedIndexFile.close();
+      CodecUtil.writeFooter(out);
+      success = true;
+    } finally {
+      this.mergeOutputStream = null;
+      this.mergedIndexFile = null;
+      if (success) {
+        IOUtils.close(out);
+      } else {
+        // Still release the file handle, without masking the original failure.
+        IOUtils.closeWhileHandlingException(out);
+      }
+    }
+  }
+
+  /**
+   * OutputStream for writing into an IndexOutput.
+   *
+   * <p>Bytes are gathered in a fixed-size buffer and handed to the output whenever the buffer is
+   * full or {@link #flush()} is called. {@link #close()} flushes but leaves the output open.
+   */
+  public class SegmentOutputStream extends OutputStream {
+
+    IndexOutput out;
+    int bufferSize;
+    byte[] buffer;
+    int p;
+
+    /**
+     * @param out destination of the bytes
+     * @param bufferSize size of the internal buffer, must be positive
+     */
+    public SegmentOutputStream(IndexOutput out, int bufferSize) throws IOException {
+      super();
+      if (bufferSize <= 0) {
+        throw new IllegalArgumentException("bufferSize must be positive, got " + bufferSize);
+      }
+      this.out = out;
+      this.bufferSize = bufferSize;
+      this.buffer = new byte[this.bufferSize];
+    }
+
+    @Override
+    public void write(int b) throws IOException {
+      buffer[p] = (byte) b;
+      p += 1;
+      if (p == bufferSize) {
+        flush();
+      }
+    }
+
+    /**
+     * Bulk write. Without this override the zip stream's block writes went through {@link
+     * #write(int)} one byte at a time. The bytes reaching the output and the flush points are
+     * the same as before.
+     */
+    @Override
+    public void write(byte[] b, int off, int len) throws IOException {
+      if (off < 0 || len < 0 || len > b.length - off) {
+        throw new IndexOutOfBoundsException("off=" + off + ", len=" + len + ", size=" + b.length);
+      }
+      while (len > 0) {
+        int chunk = Math.min(len, bufferSize - p);
+        System.arraycopy(b, off, buffer, p, chunk);
+        p += chunk;
+        off += chunk;
+        len -= chunk;
+        if (p == bufferSize) {
+          flush();
+        }
+      }
+    }
+
+    @Override
+    public void flush() throws IOException {
+      out.writeBytes(buffer, p);
+      p = 0;
+    }
+
+    @Override
+    public void close() throws IOException {
+      this.flush();
+    }
+  }
+}
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java
new file mode 100644
index 000000000000..a1473c4acf20
--- /dev/null
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.sandbox.vectorsearch;
+
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.search.KnnCollector;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.TotalHits;
+
+/**
+ * KnnCollector for CuVS.
+ *
+ * <p>Besides gathering hits, the collector carries the CAGRA search parameters {@link #iTopK} and
+ * {@link #searchWidth} to the reader. Collected values are treated as distances and turned into
+ * scores, see {@link #collect(int, float)}.
+ */
+public class PerLeafCuVSKnnCollector implements KnnCollector {
+
+  public List<ScoreDoc> scoreDocs;
+  public int topK = 0;
+  public int iTopK = topK; // TODO getter, no setter
+  public int searchWidth = 1; // TODO getter, no setter
+  public int results = 0;
+
+  /**
+   * @param topK number of hits requested
+   * @param iTopK value the reader passes to the CAGRA search as its internal top-k size
+   * @param searchWidth value the reader passes to the CAGRA search as its search width
+   */
+  public PerLeafCuVSKnnCollector(int topK, int iTopK, int searchWidth) {
+    super();
+    this.topK = topK;
+    this.iTopK = iTopK;
+    this.searchWidth = searchWidth;
+    scoreDocs = new ArrayList<>();
+  }
+
+  @Override
+  public boolean earlyTerminated() {
+    // TODO: may need implementation
+    return false;
+  }
+
+  @Override
+  public void incVisitedCount(int count) {
+    // TODO: may need implementation
+  }
+
+  @Override
+  public long visitedCount() {
+    // TODO: may need implementation
+    return 0;
+  }
+
+  @Override
+  public long visitLimit() {
+    // TODO: may need implementation
+    return 0;
+  }
+
+  @Override
+  public int k() {
+    return topK;
+  }
+
+  /**
+   * Records a hit.
+   *
+   * <p>The score is {@code 1 / (1 + similarity)}. The earlier {@code 1 / similarity} produced an
+   * infinite score for an exact match (value 0); the new form keeps the same ranking for
+   * non-negative values and stays finite.
+   *
+   * <p>NOTE(review): assumes {@code similarity} is a non-negative distance as reported by cuVS -
+   * confirm for the metric in use.
+   *
+   * @param docId document id of the hit
+   * @param similarity value reported for the hit; smaller means closer
+   * @return always {@code true}
+   */
+  @Override
+  public boolean collect(int docId, float similarity) {
+    scoreDocs.add(new ScoreDoc(docId, 1f / (1f + similarity)));
+    return true;
+  }
+
+  @Override
+  public float minCompetitiveSimilarity() {
+    // TODO: may need implementation
+    return 0;
+  }
+
+  /**
+   * Returns the collected hits ordered by descending score (ties by ascending doc id), limited to
+   * {@link #k()} hits when {@code k} is positive. Total hits reports everything collected.
+   *
+   * <p>Hits arrive per index in arbitrary order, so returning them in insertion order, as before,
+   * handed callers an unsorted and possibly over-long result.
+   */
+  @Override
+  public TopDocs topDocs() {
+    List<ScoreDoc> sorted = new ArrayList<>(scoreDocs);
+    sorted.sort(
+        (a, b) -> {
+          int byScore = Float.compare(b.score, a.score);
+          return byScore != 0 ? byScore : Integer.compare(a.doc, b.doc);
+        });
+    int limit = topK > 0 ? Math.min(topK, sorted.size()) : sorted.size();
+    return new TopDocs(
+        new TotalHits(scoreDocs.size(), TotalHits.Relation.EQUAL_TO),
+        sorted.subList(0, limit).toArray(new ScoreDoc[0]));
+  }
+}
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java
new file mode 100644
index 000000000000..47c6d3c3cedf
--- /dev/null
+++
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/SegmentInputStream.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.sandbox.vectorsearch;
+
+import java.io.IOException;
+import java.io.InputStream;
+import org.apache.lucene.store.IndexInput;
+
+/**
+ * InputStream semantics for reading from an IndexInput.
+ *
+ * <p>The stream exposes {@code limit} bytes of the wrapped input, starting at {@code
+ * initialFilePointerPosition}. {@link #reset()} without a preceding {@link #mark(int)} rewinds to
+ * that start. The stream moves the file pointer of the wrapped input, so nothing else should
+ * reposition that input while the stream is in use.
+ */
+public class SegmentInputStream extends InputStream {
+
+  /** Wrapped input; {@link #close()} leaves it open. */
+  private final IndexInput indexInput;
+
+  public final long initialFilePointerPosition;
+  public final long limit;
+  public long pos = 0;
+
+  /** Stream offset that {@link #reset()} returns to; 0 until {@link #mark(int)} is called. */
+  private long markPos = 0;
+
+  // TODO: This input stream needs to be modified to enable buffering.
+  /**
+   * Wraps {@code indexInput} and seeks it to {@code initialFilePointerPosition}.
+   *
+   * @param indexInput input to read from
+   * @param limit number of bytes readable through this stream
+   * @param initialFilePointerPosition file pointer of the first readable byte
+   * @throws IOException if the seek fails
+   */
+  public SegmentInputStream(IndexInput indexInput, long limit, long initialFilePointerPosition)
+      throws IOException {
+    super();
+    this.indexInput = indexInput;
+    this.initialFilePointerPosition = initialFilePointerPosition;
+    this.limit = limit;
+
+    this.indexInput.seek(initialFilePointerPosition);
+  }
+
+  /** Reads one byte, or returns -1 once {@code limit} bytes have been consumed. */
+  @Override
+  public int read() throws IOException {
+    if (pos >= limit) {
+      return -1;
+    }
+    int value = indexInput.readByte() & 0xFF;
+    pos++;
+    return value;
+  }
+
+  /**
+   * Reads up to {@code len} bytes, never past {@code limit}.
+   *
+   * @return number of bytes read, 0 when {@code len} is 0, or -1 at the end of the stream
+   * @throws java.io.UncheckedIOException if the wrapped input fails; the signature declares no
+   *     checked exception, so the {@link IOException} is carried as the cause
+   */
+  @Override
+  public int read(byte[] b, int off, int len) {
+    if (len <= 0) {
+      return 0;
+    }
+    if (pos >= limit) {
+      return -1;
+    }
+    int count = (int) Math.min(len, limit - pos);
+    try {
+      indexInput.readBytes(b, off, count);
+    } catch (IOException e) {
+      throw new java.io.UncheckedIOException(e);
+    }
+    pos += count;
+    return count;
+  }
+
+  @Override
+  public int read(byte[] b) throws IOException {
+    return read(b, 0, b.length);
+  }
+
+  /** Returns to the marked offset, or to the start of the stream if no mark was set. */
+  @Override
+  public void reset() throws IOException {
+    indexInput.seek(initialFilePointerPosition + markPos);
+    pos = markPos;
+  }
+
+  /** Skips up to {@code n} bytes without reading them; never skips past {@code limit}. */
+  @Override
+  public long skip(long n) throws IOException {
+    if (n <= 0 || pos >= limit) {
+      return 0;
+    }
+    long skipped = Math.min(n, limit - pos);
+    indexInput.seek(indexInput.getFilePointer() + skipped);
+    pos += skipped;
+    return skipped;
+  }
+
+  @Override
+  public boolean markSupported() {
+    return true;
+  }
+
+  /**
+   * Remembers the current offset for {@link #reset()}. The wrapped input is seekable, so {@code
+   * readlimit} is not needed and is ignored.
+   */
+  @Override
+  public void mark(int readlimit) {
+    markPos = pos;
+  }
+
+  @Override
+  public void close() {
+    // Do nothing for now.
+  }
+
+  /** Number of bytes left before {@code limit}, capped at {@link Integer#MAX_VALUE}. */
+  @Override
+  public int available() {
+    long remaining = limit - pos;
+    return remaining <= 0 ? 0 : (int) Math.min(remaining, Integer.MAX_VALUE);
+  }
+}
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java
new file mode 100644
index 000000000000..dfe60b29ea27
--- /dev/null
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/Util.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.ByteArrayOutputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; +import org.apache.commons.lang3.SerializationUtils; + +/** + * Some Utils used in CuVS integration + */ +public class Util { + + public static ByteArrayOutputStream getZipEntryBAOS( + String fileName, SegmentInputStream segInputStream) throws IOException { + segInputStream.reset(); + ZipInputStream zipInputStream = new ZipInputStream(segInputStream); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + boolean fileFound = false; + ZipEntry zipEntry; + while (zipInputStream.available() == 1 + && ((zipEntry = zipInputStream.getNextEntry()) != null)) { + if (zipEntry.getName().equals(fileName)) { + fileFound = true; + byte[] buffer = new byte[1024]; + int length; + while ((length = zipInputStream.read(buffer)) != -1) { + baos.write(buffer, 0, length); + } + } + } + if (!fileFound) throw new FileNotFoundException(); + return baos; + } + + // private static final Logger log = Logger.getLogger(Util.class.getName()); + + public static ArrayList getMergedVectors( + List segInputStreams, String fieldName, String mergedSegmentName) + throws IOException { + ZipEntry zs; + ArrayList mergedVectors = new 
ArrayList(); + // log.info("Getting mergedVectors..."); + for (SegmentInputStream segInputStream : segInputStreams) { + segInputStream.reset(); + ZipInputStream zipStream = new ZipInputStream(segInputStream); + while ((zs = zipStream.getNextEntry()) != null) { + // log.info("Getting mergedVectors... " + zs.getName()); + byte[] buffer = new byte[1024]; + int length; + if (zs.getName().endsWith(".vec")) { + String field = zs.getName().split("\\.")[0].split("/")[1]; + if (fieldName.equals(field)) { + ByteArrayOutputStream baosM = new ByteArrayOutputStream(); + while ((length = zipStream.read(buffer)) != -1) { + baosM.write(buffer, 0, length); + } + List m = SerializationUtils.deserialize(baosM.toByteArray()); + mergedVectors.addAll(m); + } + } + } + } + return mergedVectors; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java new file mode 100644 index 000000000000..a11c94e7224b --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * CuVS based fast vector search + */ +package org.apache.lucene.sandbox.vectorsearch; diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec new file mode 100644 index 000000000000..6f0a89e365d1 --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.sandbox.vectorsearch.CuVSCodec \ No newline at end of file diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat new file mode 100644 index 000000000000..666ee726f986 --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java new file mode 100644 index 000000000000..70325a3aa294 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.TreeMap; +import java.util.logging.Logger; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.analysis.MockTokenizer; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.English; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks; +import org.apache.lucene.tests.util.TestUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +@SuppressSysoutChecks(bugUrl = "prints info from within cuvs") +public class TestCuVS extends LuceneTestCase { + + protected static Logger log = Logger.getLogger(TestCuVS.class.getName()); + + private static IndexSearcher searcher; + private static IndexReader reader; + private static Directory directory; + + public static int DATASET_SIZE_LIMIT = 1000; + public static int DIMENSIONS_LIMIT = 2048; + public static int NUM_QUERIES_LIMIT = 10; + public static int TOP_K_LIMIT = 64; // TODO This fails beyond 64 + + public static float[][] dataset = null; + + @BeforeClass + public static void beforeClass() throws Exception { + directory = newDirectory(); + + Codec codec = new CuVSCodec(); + + RandomIndexWriter writer = + new 
RandomIndexWriter( + random(), + directory, + newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)) + .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)) + .setCodec(codec) + .setMergePolicy(newTieredMergePolicy())); + + log.info("Merge Policy: " + writer.w.getConfig().getMergePolicy()); + + Random random = random(); + int datasetSize = random.nextInt(DATASET_SIZE_LIMIT) + 1; + int dimensions = random.nextInt(DIMENSIONS_LIMIT) + 1; + dataset = generateDataset(random, datasetSize, dimensions); + for (int i = 0; i < datasetSize; i++) { + Document doc = new Document(); + doc.add(new StringField("id", String.valueOf(i), Field.Store.YES)); + doc.add(newTextField("field", English.intToEnglish(i), Field.Store.YES)); + boolean skipVector = + random.nextInt(10) < 0; // disable testing with holes for now, there's some bug. + if (!skipVector + || datasetSize < 100) { // about 10th of the documents shouldn't have a single vector + doc.add(new KnnFloatVectorField("vector", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new KnnFloatVectorField("vector2", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); + } + + writer.addDocument(doc); + } + + reader = writer.getReader(); + searcher = newSearcher(reader); + writer.close(); + } + + @AfterClass + public static void afterClass() throws Exception { + reader.close(); + directory.close(); + searcher = null; + reader = null; + directory = null; + log.info("Test finished"); + } + + @Test + public void testVectorSearch() throws IOException { + Random random = random(); + int numQueries = random.nextInt(NUM_QUERIES_LIMIT) + 1; + int topK = Math.min(random.nextInt(TOP_K_LIMIT) + 1, dataset.length); + + if (dataset.length < topK) topK = dataset.length; + + float[][] queries = generateQueries(random, dataset[0].length, numQueries); + List> expected = generateExpectedResults(topK, dataset, queries); + + log.info("Dataset size: " + dataset.length + "x" + dataset[0].length); + log.info("Query size: " + 
numQueries + "x" + queries[0].length); + log.info("TopK: " + topK); + + Query query = new CuVSKnnFloatVectorQuery("vector", queries[0], topK, topK, 1); + int correct[] = new int[topK]; + for (int i = 0; i < topK; i++) correct[i] = expected.get(0).get(i); + + ScoreDoc[] hits = searcher.search(query, topK).scoreDocs; + log.info("RESULTS: " + Arrays.toString(hits)); + log.info("EXPECTD: " + expected.get(0)); + + for (ScoreDoc hit : hits) { + log.info("\t" + reader.storedFields().document(hit.doc).get("id") + ": " + hit.score); + } + + for (ScoreDoc hit : hits) { + int doc = Integer.parseInt(reader.storedFields().document(hit.doc).get("id")); + assertTrue("Result returned was not in topk*2: " + doc, expected.get(0).contains(doc)); + } + } + + private static float[][] generateQueries(Random random, int dimensions, int numQueries) { + // Generate random query vectors + float[][] queries = new float[numQueries][dimensions]; + for (int i = 0; i < numQueries; i++) { + for (int j = 0; j < dimensions; j++) { + queries[i][j] = random.nextFloat() * 100; + } + } + return queries; + } + + private static float[][] generateDataset(Random random, int datasetSize, int dimensions) { + // Generate a random dataset + float[][] dataset = new float[datasetSize][dimensions]; + for (int i = 0; i < datasetSize; i++) { + for (int j = 0; j < dimensions; j++) { + dataset[i][j] = random.nextFloat() * 100; + } + } + return dataset; + } + + private static List> generateExpectedResults( + int topK, float[][] dataset, float[][] queries) { + List> neighborsResult = new ArrayList<>(); + int dimensions = dataset[0].length; + + for (float[] query : queries) { + Map distances = new TreeMap<>(); + for (int j = 0; j < dataset.length; j++) { + double distance = 0; + for (int k = 0; k < dimensions; k++) { + distance += (query[k] - dataset[j][k]) * (query[k] - dataset[j][k]); + } + distances.put(j, (distance)); + } + + Map sorted = new TreeMap(distances); + log.info("EXPECTED: " + sorted); + + // Sort by 
distance and select the topK nearest neighbors + List neighbors = + distances.entrySet().stream() + .sorted(Map.Entry.comparingByValue()) + .map(Map.Entry::getKey) + .toList(); + neighborsResult.add( + neighbors.subList( + 0, + Math.min( + topK * 3, + dataset.length))); // generate double the topK results in the expected array + } + + log.info("Expected results generated successfully."); + return neighborsResult; + } +} diff --git a/versions.lock b/versions.lock index 26de44f99e2d..dfa465a1b3fe 100644 --- a/versions.lock +++ b/versions.lock @@ -4,6 +4,7 @@ "main_dependencies" : { "com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.8.1" : "fa9ef26b,refs=4", "com.ibm.icu:icu4j:74.2" : "47ea4550,refs=6", + "com.nvidia.cuvs:cuvs-java:25.02" : "0129b4f0,refs=6", "commons-codec:commons-codec:1.13" : "e9962aab,refs=4", "io.sgr:s2-geometry-library-java:1.0.0" : "cbc357ab,refs=4", "junit:junit:4.13.1" : "fa9ef26b,refs=4", @@ -11,6 +12,7 @@ "net.sourceforge.nekohtml:nekohtml:1.9.17" : "5ce8cdc6,refs=2", "org.antlr:antlr4-runtime:4.11.1" : "d9953130,refs=4", "org.apache.commons:commons-compress:1.19" : "5ce8cdc6,refs=2", + "org.apache.commons:commons-lang3:3.17.0" : "0129b4f0,refs=6", "org.apache.commons:commons-math3:3.6.1" : "85a1e4c6,refs=2", "org.apache.opennlp:opennlp-tools:2.3.2" : "2f760bab,refs=4", "org.carrot2:morfologik-fsa:2.1.9" : "79af844b,refs=4", @@ -46,6 +48,7 @@ "com.google.j2objc:j2objc-annotations:1.3" : "6897bc09,refs=38", "com.google.protobuf:protobuf-java:3.19.2" : "6897bc09,refs=38", "com.ibm.icu:icu4j:74.2" : "ffa00415,refs=8", + "com.nvidia.cuvs:cuvs-java:25.02" : "7ac6f8d9,refs=9", "commons-codec:commons-codec:1.13" : "733734f0,refs=6", "io.github.java-diff-utils:java-diff-utils:4.0" : "6897bc09,refs=38", "io.sgr:s2-geometry-library-java:1.0.0" : "1d5a4b2b,refs=4", @@ -55,6 +58,7 @@ "net.sourceforge.nekohtml:nekohtml:1.9.17" : "6f16ff86,refs=2", "org.antlr:antlr4-runtime:4.11.1" : "6fbc4021,refs=5", 
"org.apache.commons:commons-compress:1.19" : "6f16ff86,refs=2", + "org.apache.commons:commons-lang3:3.17.0" : "7ac6f8d9,refs=9", "org.apache.commons:commons-math3:3.6.1" : "152d9f78,refs=3", "org.apache.opennlp:opennlp-tools:2.3.2" : "b91715f0,refs=6", "org.assertj:assertj-core:3.21.0" : "b7ba1646,refs=2", @@ -79,6 +83,32 @@ } }, "because" : { + "0129b4f0" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "152d9f78" : [ { "configuration" : "annotationProcessor", @@ -405,6 +435,44 @@ "projectPath" : ":lucene:analysis:morfologik" } ], + "7ac6f8d9" : [ + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "85a1e4c6" : [ { "configuration" : "compileClasspath", diff --git a/versions.toml b/versions.toml index 80dc51f39bf2..d0db5fd20d9d 100644 --- a/versions.toml +++ 
b/versions.toml @@ -4,6 +4,8 @@ asm = "9.6" assertj = "3.21.0" commons-codec = "1.13" commons-compress = "1.19" +commons-lang3 = "3.17.0" +cuvs = "25.02" ecj = "3.36.0" errorprone = "2.18.0" flexmark = "0.61.24" @@ -42,6 +44,8 @@ asm-core = { module = "org.ow2.asm:asm", version.ref = "asm" } assertj = { module = "org.assertj:assertj-core", version.ref = "assertj" } commons-codec = { module = "commons-codec:commons-codec", version.ref = "commons-codec" } commons-compress = { module = "org.apache.commons:commons-compress", version.ref = "commons-compress" } +commons-lang3 = { module = "org.apache.commons:commons-lang3", version.ref = "commons-lang3" } +cuvs = { module = "com.nvidia.cuvs:cuvs-java", version.ref = "cuvs" } ecj = { module = "org.eclipse.jdt:ecj", version.ref = "ecj" } errorprone = { module = "com.google.errorprone:error_prone_core", version.ref = "errorprone" } flexmark-core = { module = "com.vladsch.flexmark:flexmark", version.ref = "flexmark" }