openai · bu2kx · Jul 24, 2023 · Aug 25, 2023 · Aug 29, 2023 · Aug 30, 2023
diff --git a/.gitignore b/.gitignore
@@ -138,4 +138,4 @@ dmypy.json
 .pyre/
 
 # macOS .DS_Store files
-.DS_Store
+.DS_Store
diff --git a/README b/README
@@ -0,0 +1 @@
+TBD
diff --git a/README.md b/README.md
@@ -119,6 +119,10 @@ Follow these steps to quickly set up and run the ChatGPT Retrieval Plugin:
    export MILVUS_USER=<your_milvus_username>
    export MILVUS_PASSWORD=<your_milvus_password>
 
+   # KDB.AI
+   export KDBAI_ENDPOINT=<KDB.AI_endpoint>
+   export KDBAI_API_KEY=<KDB.AI_API_key>
+
    # Qdrant
    export QDRANT_URL=<your_qdrant_url>
    export QDRANT_PORT=<your_qdrant_port>
@@ -388,6 +392,10 @@ For more detailed instructions on setting up and using each vector database prov
 
 [Milvus](https://milvus.io/) is an open-source, cloud-native vector database that scales to billions of vectors. It is the open-source version of Zilliz and shares many of its features, such as various indexing algorithms, distance metrics, scalar filtering, time travel searches, rollback with snapshots, multi-language SDKs, storage and compute separation, and cloud scalability. For detailed setup instructions, refer to [`/docs/providers/milvus/setup.md`](/docs/providers/milvus/setup.md).
 
+#### KDB.AI
+
+[KDB.AI](https://kdb.ai) is a powerful knowledge-based vector database and search engine that allows developers to build scalable, reliable and real-time applications by providing advanced search, recommendation and personalization for AI applications, using real-time data. For detailed setup instructions, refer to [`/docs/providers/kdbai/setup.md`](/docs/providers/kdbai/setup.md).
+
 #### Qdrant
 
 [Qdrant](https://qdrant.tech/) is a vector database capable of storing documents and vector embeddings. It offers both self-hosted and managed [Qdrant Cloud](https://cloud.qdrant.io/) deployment options, providing flexibility for users with different requirements. For detailed setup instructions, refer to [`/docs/providers/qdrant/setup.md`](/docs/providers/qdrant/setup.md).

diff --git a/datastore/factory.py b/datastore/factory.py
@@ -65,11 +65,15 @@ async def get_datastore() -> DataStore:
         case "elasticsearch":
             from datastore.providers.elasticsearch_datastore import (
                 ElasticsearchDataStore,
-            )
+            ) 
+            return ElasticsearchDataStore()    
+        case "kdbai":
+            from datastore.providers.kdbai_datastore import KDBAIDataStore
+
+            return KDBAIDataStore()
 
-            return ElasticsearchDataStore()
         case _:
             raise ValueError(
                 f"Unsupported vector database: {datastore}. "
-                f"Try one of the following: llama, elasticsearch, pinecone, weaviate, milvus, zilliz, redis, azuresearch, or qdrant"
+                f"Try one of the following: llama, elasticsearch, pinecone, weaviate, milvus, zilliz, redis, azuresearch, kdbai or qdrant"
             )
diff --git a/datastore/providers/kdbai_datastore.py b/datastore/providers/kdbai_datastore.py
@@ -0,0 +1,171 @@
+import os
+from typing import Dict, List, Optional
+
+from loguru import logger
+import pandas as pd
+
+from services.date import to_unix_timestamp
+from datastore.datastore import DataStore
+
+from models.models import (
+    DocumentChunk,
+    DocumentChunkWithScore,
+    DocumentMetadataFilter,
+    QueryResult,
+    QueryWithEmbedding,
+)
+
+# pykx and kdbai_client are required by this provider; fail fast with a clear
+# message if either is missing (ValueError kept for backward compatibility).
+try:
+    import pykx as kx
+    logger.info('PyKX version: ' + kx.__version__)
+except ImportError as e:
+    raise ValueError(
+        'Could not import pykx package. '
+        'Please add it to the dependencies.'
+    ) from e
+
+try:
+    import kdbai_client as kdbai
+    logger.info('KDBAI client version: ' + kdbai.__version__)
+except ImportError as e:
+    raise ValueError(
+        'Could not import kdbai_client package. '
+        'Please add it to the dependencies.'
+    ) from e
+
+
+KDBAI_ENDPOINT = os.environ.get('KDBAI_ENDPOINT', 'http://localhost:8082')
+KDBAI_API_KEY = os.environ.get('KDBAI_API_KEY') or None  # unset/empty -> None
+
+# NOTE(review): dims must equal the embedding length being inserted -- confirm.
+DEFAULT_DIMS = 3072
+BATCH_SIZE = 100  # rows sent per insert call
+
+DEFAULT_SCHEMA = dict(
+    columns=[
+        dict(name='document_id', pytype='str'),
+        dict(name='text', pytype='bytes'),  # NOTE(review): chunk text is str -- confirm
+        dict(name='vecs', vectorIndex=dict(type='flat', metric='L2', dims=DEFAULT_DIMS)),
+    ])
+
+# os.environ only holds strings, so a KDBAI_SCHEMA override is read as JSON.
+import json
+SCHEMA = json.loads(os.environ['KDBAI_SCHEMA']) if os.environ.get('KDBAI_SCHEMA') else DEFAULT_SCHEMA
+TABLE = os.environ.get('KDBAI_TABLE', 'documents')
+
+
+class KDBAIDataStore(DataStore):
+    """DataStore implementation backed by a single KDB.AI table.
+
+    Each document chunk is stored as one row holding the parent document id,
+    the chunk text and the chunk embedding. Chunk ids and metadata are not
+    stored.
+    """
+
+    def __init__(self) -> None:
+        """Open a KDB.AI session and attach to TABLE, creating it if missing.
+
+        Raises:
+            Exception: Whatever the KDB.AI client raised, after logging it.
+        """
+        try:
+            logger.info('Creating KDBAI datastore...')
+            self._session = kdbai.Session(endpoint=KDBAI_ENDPOINT, api_key=KDBAI_API_KEY)
+
+            if TABLE in self._session.list():
+                self._table = self._session.table(TABLE)
+            else:
+                self._table = self._session.create_table(TABLE, SCHEMA)
+
+        except Exception as e:
+            logger.error(f'Error while initializing KDBAI datastore: {e}.')
+            # Bare raise keeps the original traceback intact.
+            raise
+
+    async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]:
+        """Insert every chunk as one table row, BATCH_SIZE rows per request.
+
+        This is best effort: failures are logged and never raised. A batch
+        that fails to insert does not stop the remaining batches.
+
+        Args:
+            chunks (Dict[str, List[DocumentChunk]]): Chunks keyed by document id.
+
+        Returns:
+            List[str]: Unique ids, in first-seen order, of the documents whose
+            rows were all inserted. Empty when nothing was inserted.
+        """
+        try:
+            rows = [
+                dict(document_id=doc_id, text=chunk.text, vecs=chunk.embedding)
+                for doc_id, chunk_list in chunks.items()
+                for chunk in chunk_list
+            ]
+            if not rows:
+                # An empty frame has no document_id column to read back.
+                return []
+            df = pd.DataFrame(rows)
+
+            failed = set()
+            for start in range(0, len(df), BATCH_SIZE):
+                batch = df.iloc[start:start + BATCH_SIZE]
+                try:
+                    self._table.insert(batch, warn=False)
+                except Exception:
+                    logger.exception('Failed to insert the batch of documents into the data store.')
+                    # Do not report documents that lost rows as inserted.
+                    failed.update(batch['document_id'])
+
+            # dict.fromkeys de-duplicates while keeping first-seen order.
+            return [doc_id for doc_id in dict.fromkeys(df['document_id']) if doc_id not in failed]
+
+        except Exception:
+            logger.exception('Failed to insert documents into the data store.')
+            return []
+
+    async def _query(
+        self,
+        queries: List[QueryWithEmbedding],
+    ) -> List[QueryResult]:
+        """Run one nearest-neighbour search per query embedding.
+
+        Metadata filters on the queries are not applied. Each hit's id is the
+        stored document_id, its metadata is empty, and its score is the value
+        of the `__nn_distance` column returned by the search.
+
+        Args:
+            queries (List[QueryWithEmbedding]): The list of searches to perform.
+
+        Returns:
+            List[QueryResult]: Results for each search, in input order.
+
+        Raises:
+            Exception: Whatever the table search raised, after logging it.
+        """
+        out = []
+        for query in queries:
+            try:
+                resdf = self._table.search(vectors=[query.embedding], n=query.top_k)[0]
+            except Exception:
+                logger.exception("Error while processing queries.")
+                raise
+
+            docs = []
+            for result in resdf.to_dict(orient='records'):
+                text = result['text']
+                if isinstance(text, bytes):
+                    # The schema declares the text column as bytes; the
+                    # result model expects str.
+                    text = text.decode('utf-8')
+                docs.append(DocumentChunkWithScore(
+                    id=result['document_id'],
+                    text=text,
+                    metadata=dict(),
+                    score=result['__nn_distance'],
+                ))
+            out.append(QueryResult(query=query.query, results=docs))
+
+        return out
+
+    async def delete(
+        self,
+        ids: Optional[List[str]] = None,
+        filter: Optional[DocumentMetadataFilter] = None,
+        delete_all: Optional[bool] = None,
+    ) -> bool:
+        """Delete all vectors and the associated index.
+
+        Only delete_all is supported; deleting by ids or filter is not
+        implemented and is reported as a failure.
+
+        Args:
+            ids (Optional[List[str]]): Ignored.
+            filter (Optional[DocumentMetadataFilter]): Ignored.
+            delete_all (Optional[bool]): Drop the whole table when truthy.
+
+        Returns:
+            bool: True if the table was dropped, False otherwise.
+        """
+        if not delete_all:
+            logger.error("Functionality is not implemented yet")
+            return False
+
+        try:
+            self._table.drop()
+            # Recreate an empty table so this datastore stays usable; the old
+            # handle refers to a table that no longer exists.
+            self._table = self._session.create_table(TABLE, SCHEMA)
+        except Exception as e:
+            logger.error("Failed to delete records, error: {}".format(e))
+            return False
+
+        logger.info("Deleted all vectors and index successfully")
+        return True
+
diff --git a/docs/providers/kdbai/setup.md b/docs/providers/kdbai/setup.md
@@ -0,0 +1,16 @@
+# KDB.AI
+
+[KDB.AI](https://kdb.ai) is a powerful knowledge-based vector database and search engine that allows developers to build scalable, reliable and real-time applications by providing advanced search, recommendation and personalization for AI applications, using real-time data. You can register for a free trial at https://kdb.ai.
+
+You can find a sample notebook to use the ChatGPT Retrieval Plugin backed by KDB.AI vector database [here](https://github.com/KxSystems/chatgpt-retrieval-plugin/blob/KDB.AI/examples/providers/kdbai/ChatGPT_QA_Demo.ipynb) and instructions to get started [here](https://code.kx.com/kdbai/integrations/openai.html).
+
+**Environment Variables:**
+
+| Name                | Required | Description                                                 | Default            |
+| ------------------- | -------- | ----------------------------------------------------------- | ------------------ |
+| `DATASTORE`         | Yes      | Datastore name, set to `kdbai`                              |                    |
+| `BEARER_TOKEN`      | Yes      | Secret token                                                |                    |
+| `OPENAI_API_KEY`    | Yes      | OpenAI API key                                              |                    |
+| `KDBAI_ENDPOINT`    | Yes      | KDB.AI endpoint                                             |                    |
+| `KDBAI_API_KEY`     | Yes      | KDB.AI API key                                              |                    |
+| `KDBAI_TABLE`       | Optional | Name of the KDB.AI table to use                             | `documents`        |