From 88cc0a44176e6c7ea3cbb5f71925f0235eac12c2 Mon Sep 17 00:00:00 2001 From: Kalin Arsov Date: Fri, 3 Jan 2025 17:01:47 +0200 Subject: [PATCH 1/4] Create index on the node_id column for the MariaDB vector store integration --- .../llama_index/vector_stores/mariadb/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py index 532a7197087b1..d70c6a3a55a1c 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py @@ -186,6 +186,7 @@ def _create_table_if_not_exists(self) -> None: text TEXT, metadata JSON, embedding BLOB NOT NULL, + INDEX `{self.table_name}_node_id_idx` (`node_id`), VECTOR INDEX (embedding) ); """ From fd65100b6e1ed4f2ee6708a762d7b0591d4c780e Mon Sep 17 00:00:00 2001 From: Kalin Arsov Date: Fri, 3 Jan 2025 17:03:55 +0200 Subject: [PATCH 2/4] Support MariaDB 11.7 in the MariaDB vector store integration --- .../llama-index-vector-stores-mariadb/README.md | 10 ++++++---- .../llama_index/vector_stores/mariadb/base.py | 15 +++++++++------ .../pyproject.toml | 2 +- .../tests/docker-compose.yaml | 2 +- .../tests/test_mariadb.py | 1 + 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/README.md b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/README.md index 9b0be2972d336..c916ff8c6232b 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/README.md +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/README.md @@ -1,10 +1,12 @@ # LlamaIndex Vector_Stores Integration: MariaDB 
-With the release of MariaDB 11.6 Vector Preview, the MariaDB relational database introduced the long-awaited vector search functionality. +Starting with version `11.7.1`, the MariaDB relational database has vector search functionality integrated. Thus now it can be used as a fully-functional vector store in LlamaIndex. -Please note, however, that the latest MariaDB version is only an Alpha release, which means that it may crash unexpectedly. -To learn more about the feature, check the [Vector Overview](https://mariadb.com/kb/en/vector-overview/) in the MariaDB docs. +To learn more about the feature in MariaDB, check its [Vector Overview documentation](https://mariadb.com/kb/en/vector-overview/). + +Please note that versions before `0.3.0` of this package are not compatible with MariaDB 11.7 and later. +They are compatible only with the one-off `MariaDB 11.6 Vector` preview release, which used a slightly different syntax. ## Installation @@ -33,7 +35,7 @@ vector_store = MariaDBVectorStore.from_params( ### Running Integration Tests A suite of integration tests is available to verify the MariaDB vector store integration. -The test suite needs a MariaDB database with vector search support up and running, if not found the tests are skipped. +The test suite needs a MariaDB database with vector search support up and running. If none is found, the tests are skipped. 
To facilitate that, a sample `docker-compose.yaml` file is provided, so you can simply do: ```shell diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py index d70c6a3a55a1c..86dfb71bc27ba 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py @@ -179,16 +179,19 @@ def _connect(self) -> Any: def _create_table_if_not_exists(self) -> None: with self._engine.connect() as connection: + # Note that we define the vector index with DISTANCE=cosine, because we use VEC_DISTANCE_COSINE. + # This is because searches using a different distance function do not use the vector index. + # Reference: https://mariadb.com/kb/en/create-table-with-vectors/ stmt = f""" CREATE TABLE IF NOT EXISTS `{self.table_name}` ( id SERIAL PRIMARY KEY, node_id VARCHAR(255) NOT NULL, text TEXT, metadata JSON, - embedding BLOB NOT NULL, + embedding VECTOR({self.embed_dim}) NOT NULL, INDEX `{self.table_name}_node_id_idx` (`node_id`), - VECTOR INDEX (embedding) - ); + VECTOR INDEX (embedding) DISTANCE=cosine + ) """ connection.execute(sqlalchemy.text(stmt)) @@ -252,7 +255,7 @@ def add( VALUES ( :node_id, :text, - vec_fromtext(:embedding), + VEC_FromText(:embedding), :metadata ) """ @@ -368,7 +371,7 @@ def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResul text, embedding, metadata, - vec_distance(embedding, vec_fromtext('{query.query_embedding}')) AS distance + VEC_DISTANCE_COSINE(embedding, vec_fromtext('{query.query_embedding}')) AS distance FROM `{self.table_name}` ORDER BY distance LIMIT {query.similarity_top_k} @@ -387,7 +390,7 @@ def query(self, query: VectorStoreQuery, **kwargs: Any) -> 
VectorStoreQueryResul text, embedding, metadata, - vec_distance(embedding, vec_fromtext('{query.query_embedding}')) AS distance + VEC_DISTANCE_COSINE(embedding, vec_fromtext('{query.query_embedding}')) AS distance FROM `{self.table_name}` WHERE {where} LIMIT 1000000 diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/pyproject.toml index e2f05c3d49fdd..3f0c06259fa1e 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/pyproject.toml +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/pyproject.toml @@ -27,7 +27,7 @@ exclude = ["**/BUILD"] license = "MIT" name = "llama-index-vector-stores-mariadb" readme = "README.md" -version = "0.2.0" +version = "0.3.0" [tool.poetry.dependencies] python = ">=3.9,<4.0" diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/tests/docker-compose.yaml b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/tests/docker-compose.yaml index db287a5b3f5dc..f52383ac40032 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/tests/docker-compose.yaml +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/tests/docker-compose.yaml @@ -1,6 +1,6 @@ services: mariadb: - image: "quay.io/mariadb-foundation/mariadb-devel:11.6-vector-preview" + image: mariadb:11.7.1-rc environment: MARIADB_DATABASE: test MARIADB_ROOT_PASSWORD: test diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/tests/test_mariadb.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/tests/test_mariadb.py index 4d337649126a6..cbcd24ee309d7 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/tests/test_mariadb.py +++ 
b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/tests/test_mariadb.py @@ -51,6 +51,7 @@ vector_store = MariaDBVectorStore.from_params( database="test", table_name="vector_store_test", + embed_dim=3, host="127.0.0.1", user="root", password="test", From 08010f6165b57e58014f11b88e001c4d14711992 Mon Sep 17 00:00:00 2001 From: Kalin Arsov Date: Fri, 3 Jan 2025 17:16:11 +0200 Subject: [PATCH 3/4] Simplify SQL for query with filters in the MariaDB vector store integration The complex query we used before is not needed now because the bug MDEV-34774 is fixed in MariaDB 11.7. --- .../llama_index/vector_stores/mariadb/base.py | 32 +++++-------------- 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py index 86dfb71bc27ba..34b5f644a7823 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py @@ -371,33 +371,17 @@ def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResul text, embedding, metadata, - VEC_DISTANCE_COSINE(embedding, vec_fromtext('{query.query_embedding}')) AS distance - FROM `{self.table_name}` - ORDER BY distance - LIMIT {query.similarity_top_k} - """ + VEC_DISTANCE_COSINE(embedding, VEC_FromText('{query.query_embedding}')) AS distance + FROM `{self.table_name}`""" if query.filters: - where = self._filters_to_where_clause(query.filters) + stmt += f""" + WHERE {self._filters_to_where_clause(query.filters)}""" - # We cannot use the query above when there is a WHERE clause, - # because of a bug in MariaDB: https://jira.mariadb.org/browse/MDEV-34774. - # The following query works around it. 
- stmt = f""" - SELECT * FROM ( - SELECT - node_id, - text, - embedding, - metadata, - VEC_DISTANCE_COSINE(embedding, vec_fromtext('{query.query_embedding}')) AS distance - FROM `{self.table_name}` - WHERE {where} - LIMIT 1000000 - ) AS unordered - ORDER BY distance - LIMIT {query.similarity_top_k} - """ + stmt += f""" + ORDER BY distance + LIMIT {query.similarity_top_k} + """ with self._engine.connect() as connection: result = connection.execute(sqlalchemy.text(stmt)) From db6ac6852af664400060215c64a3ce5c8bfe96e8 Mon Sep 17 00:00:00 2001 From: Kalin Arsov Date: Wed, 15 Jan 2025 17:30:18 +0200 Subject: [PATCH 4/4] Validate that the server version is supported on MariaDB vector store init --- .../llama_index/vector_stores/mariadb/base.py | 28 ++++++++++++++++++ .../pyproject.toml | 3 ++ .../tests/test_mariadb.py | 29 +++++++++++++++++-- 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py index 34b5f644a7823..5394ad1c9e1df 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/llama_index/vector_stores/mariadb/base.py @@ -177,6 +177,17 @@ def _connect(self) -> Any: self.connection_string, connect_args=self.connection_args, echo=self.debug ) + def _validate_server_version(self) -> None: + """Validate that the MariaDB server version is supported.""" + with self._engine.connect() as connection: + result = connection.execute(sqlalchemy.text("SELECT VERSION()")) + version = result.fetchone()[0] + + if not _meets_min_server_version(version, "11.7.1"): + raise ValueError( + f"MariaDB version 11.7.1 or later is required, found version: {version}." 
+ ) + def _create_table_if_not_exists(self) -> None: with self._engine.connect() as connection: # Note that we define the vector index with DISTANCE=cosine, because we use VEC_DISTANCE_COSINE. @@ -201,6 +212,7 @@ def _initialize(self) -> None: if not self._is_initialized: self._connect() if self.perform_setup: + self._validate_server_version() self._create_table_if_not_exists() self._is_initialized = True @@ -431,3 +443,19 @@ def clear(self) -> None: connection.execute(sqlalchemy.text(stmt)) connection.commit() + + +def _meets_min_server_version(version: str, min_version: str) -> bool: + """Check if a MariaDB server version meets minimum required version. + + Args: + version: Version string from MariaDB server (e.g. "11.7.1-MariaDB-ubu2404") + min_version: Minimum required version string (e.g. "11.7.1") + + Returns: + bool: True if version >= min_version, False otherwise + """ + version = version.split("-")[0] + version_parts = [int(x) for x in version.split(".")] + min_version_parts = [int(x) for x in min_version.split(".")] + return version_parts >= min_version_parts diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/pyproject.toml index 3f0c06259fa1e..94443f379bba9 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/pyproject.toml +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/pyproject.toml @@ -69,3 +69,6 @@ include = "llama_index/" filterwarnings = [ "ignore::DeprecationWarning:", ] +markers = [ + "noautousefixtures: marks tests that should not run fixtures with autouse", +] diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/tests/test_mariadb.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/tests/test_mariadb.py index cbcd24ee309d7..4dc9f1e2b1a8e 100644 --- 
a/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/tests/test_mariadb.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-mariadb/tests/test_mariadb.py @@ -14,6 +14,7 @@ VectorStoreQuery, ) from llama_index.vector_stores.mariadb import MariaDBVectorStore +from llama_index.vector_stores.mariadb.base import _meets_min_server_version TEST_NODES: List[TextNode] = [ TextNode( @@ -76,21 +77,45 @@ @pytest.fixture(autouse=True) -def teardown() -> Generator: +def teardown(request: pytest.FixtureRequest) -> Generator: """Clear the store after a test completion.""" yield + if "noautousefixtures" in request.keywords: + return + vector_store.clear() @pytest.fixture(scope="session", autouse=True) -def close_db_connection() -> Generator: +def close_db_connection(request: pytest.FixtureRequest) -> Generator: """Close the DB connections after the last test.""" yield + if "noautousefixtures" in request.keywords: + return + vector_store.close() +@pytest.mark.parametrize( + ("version", "supported"), + [ + ("11.7.2-MariaDB-ubu2504", True), + ("11.7.1-MariaDB-ubu2404", True), + ("11.8.0", True), + ("12.0.0", True), + ("11.7.0", False), + ("11.6.0-MariaDB-ubu2404", False), + ("10.11.7-MariaDB-1:10.11.7+maria~ubu2204", False), + ("8.4.3", False), + ], +) +@pytest.mark.noautousefixtures() +def test_meets_min_server_version(version: str, supported: bool) -> None: + assert _meets_min_server_version(version, "11.7.1") == supported + + @pytest.mark.skipif( run_integration_tests is False, reason="MariaDB instance required for integration tests",