From 804a5f2b71d4b487159314ec77e0971cd7b7716f Mon Sep 17 00:00:00 2001 From: tryptofanik <49122854+tryptofanik@users.noreply.github.com> Date: Thu, 16 Jan 2025 15:59:05 +0100 Subject: [PATCH] New data indexing docs page (#8013) GitOrigin-RevId: d4af088f50bb592678fe5fd782139c1b938495da --- .../40.first_realtime_app_with_pathway.md | 2 +- .../.vectorstore_pipeline/article.py | 361 ------------------ .../sample_documents/repo_readme.md | 5 - .../4.user-guide/50.llm-xpack/10.overview.md | 9 +- .../50.llm-xpack/30.docs-indexing.md | 218 +++++++++++ .../50.llm-xpack/30.vectorstore_pipeline.md | 1 - .../7.templates/.adaptive-rag/article.py | 2 +- python/pathway/io/csv/__init__.py | 13 +- python/pathway/io/fs/__init__.py | 13 +- 9 files changed, 236 insertions(+), 388 deletions(-) delete mode 100644 docs/2.developers/4.user-guide/50.llm-xpack/.vectorstore_pipeline/article.py delete mode 100644 docs/2.developers/4.user-guide/50.llm-xpack/.vectorstore_pipeline/sample_documents/repo_readme.md create mode 100644 docs/2.developers/4.user-guide/50.llm-xpack/30.docs-indexing.md delete mode 120000 docs/2.developers/4.user-guide/50.llm-xpack/30.vectorstore_pipeline.md diff --git a/docs/2.developers/4.user-guide/10.introduction/40.first_realtime_app_with_pathway.md b/docs/2.developers/4.user-guide/10.introduction/40.first_realtime_app_with_pathway.md index 87674dbb..c5fb1d6a 100644 --- a/docs/2.developers/4.user-guide/10.introduction/40.first_realtime_app_with_pathway.md +++ b/docs/2.developers/4.user-guide/10.introduction/40.first_realtime_app_with_pathway.md @@ -208,7 +208,7 @@ Keep in mind that some output connectors to external data storage system might t title: "Live Data AI Pipelines" --- #default - - [Data indexing pipeline and RAG.](/developers/user-guide/llm-xpack/vectorstore_pipeline) + - [Data indexing pipeline and RAG.](/developers/user-guide/llm-xpack/docs-indexing) - [Multimodal RAG.](/developers/templates/multimodal-rag) - [Unstructured data to SQL on-the-fly.](/developers/templates/unstructured-to-structured) :: diff --git a/docs/2.developers/4.user-guide/50.llm-xpack/.vectorstore_pipeline/article.py b/docs/2.developers/4.user-guide/50.llm-xpack/.vectorstore_pipeline/article.py deleted file mode 100644 index 559572d8..00000000 --- a/docs/2.developers/4.user-guide/50.llm-xpack/.vectorstore_pipeline/article.py +++ /dev/null @@ -1,361 +0,0 @@ -# --- -# title: "Data Indexing" -# description: '' -# aside: true -# author: 'pathway' -# article: -# date: '2023-12-15' -# thumbnail: '' -# tags: ['showcase', 'llm', 'data-pipeline'] -# keywords: ['LLM', 'RAG', 'GPT', 'OpenAI', 'Google Docs', 'KNN', 'Vector store', 'langchain', 'llama-index', 'vectordb', 'vectore store langchain', 'retriever', 'unstructured'] -# notebook_export_path: notebooks/showcases/live_vector_indexing_pipeline.ipynb -# jupyter: -# jupytext: -# formats: py:percent -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.16.1 -# kernelspec: -# display_name: Python 3 (ipykernel) -# language: python -# name: python3 -# --- - -# %% [markdown] -# # Always Up-to-date Data Indexing pipeline -# -# This showcase demonstrates how to use Pathway to deploy a live data indexing pipeline that can be queried similarly to a typical vector store. Unlike traditional approaches, Pathway updates the index with every data change, ensuring consistently up-to-date answers. 
-# -# ::article-img -# --- -# src: '/assets/content/showcases/vectorstore/vectorstore_doc.png' -# alt: 'Pathway data indexing pipeline' -# class: 'mx-auto' -# zoomable: true -# --- -# :: -# -# Pathway Vectorstore enables building a document index on top of your documents without the -# complexity of ETL pipelines, managing different containers for storing, embedding, and serving. -# It allows for easy to manage, always up-to-date, LLM pipelines accesible using a RESTful API -# and with integrations to popular LLM toolkits such as Langchain and LlamaIndex. -# -# -# In this article, we will use a simple document processing pipeline that: -# 1. Monitors several data sources (files, S3 folders, cloud storages) for data changes. -# 2. Parses, splits and embeds the documents. -# 3. Builds a vector index for the data. -# -# However, If you prefer not to create the pipeline from the ground up and would like to check out the functionality, -# take a look at our [`managed pipelines`](https://cloud.pathway.com/docindex) in action. -# -# We will connect to the index using a `VectorStore` client, which allows retrieval of semantically similar documents. - -# %% [markdown] -# ## Prerequisites -# -# Install the `pathway` package. You can also install the `unstructured` package to use the most powerful `unstructured.io`-based parser. -# -# Then download sample data. - -# %% -# _MD_SHOW_!pip install "pathway[xpack-llm,xpack-llm-docs]" -# _MD_SHOW_ !pip install unstructured[all-docs] -# _MD_SHOW_!mkdir -p sample_documents -# _MD_SHOW_![ -f sample_documents/repo_readme.md ] || wget 'https://gist.githubusercontent.com/janchorowski/dd22a293f3d99d1b726eedc7d46d2fc0/raw/pathway_readme.md' -O 'sample_documents/repo_readme.md' - -# _MD_COMMENT_START_ -if 1: # group to prevent isort messing up - import json - import os - - from common.shadows import fs - - os.environ["OPENAI_API_KEY"] = json.loads( - fs.open("vault://kv.v2:deployments@/legal_rag_demo").read() - )["OPENAI_KEY"] -# _MD_COMMENT_END_ - -# %% -import logging -import sys -import time - -logging.basicConfig(stream=sys.stderr, level=logging.WARN, force=True) - -# %% [markdown] -# ## Building the data pipeline -# -# First, make sure you have an API key with an LLM provider such as OpenAI. - -# %% -import getpass -import os - -if "OPENAI_API_KEY" not in os.environ: - os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:") - -# %% [markdown] -# We will now assemble the data vectorization pipeline, using a simple `UTF8` file parser, a character splitter and an embedder from the [Pathway LLM xpack](/developers/user-guide/llm-xpack/overview). -# -# First, we define the data sources. We use the files-based one for simplicity, but any supported `pathway` [connector](/developers/api-docs/pathway-io/), such as [s3](/developers/api-docs/pathway-io/s3/) or [Google Drive](/developers/api-docs/pathway-io/gdrive#pathway.io.gdrive.read) will also work. -# -# Then, we define the embedder and splitter. -# -# Last, we assemble the data pipeline. We will start it running in a background thread to be able to query it immediately from the demonstration. Please note that in a production deployment, the server will run in another process, possibly on another machine. For the quick-start, we keep the server and client as different threads of the same Python process. - -# %% -import pathway as pw - -# This creates a connector that tracks files in a given directory. 
-data_sources = [] -data_sources.append( - pw.io.fs.read( - "./sample_documents", - format="binary", - mode="streaming", - with_metadata=True, - ) -) - -# This creates a connector that tracks files in Google Drive. -# Please follow the instructions at /developers/user-guide/connect/connectors/gdrive-connector/ to get credentials. -# data_sources.append( -# pw.io.gdrive.read(object_id="17H4YpBOAKQzEJ93xmC2z170l0bP2npMy", service_user_credentials_file="credentials.json", with_metadata=True)) - -# %% -# We now build the VectorStore pipeline - -from pathway.xpacks.llm.embedders import OpenAIEmbedder -from pathway.xpacks.llm.splitters import TokenCountSplitter -from pathway.xpacks.llm.vector_store import VectorStoreClient, VectorStoreServer - -PATHWAY_PORT = 8765 - -# Choose document transformers -text_splitter = TokenCountSplitter() -embedder = OpenAIEmbedder(api_key=os.environ["OPENAI_API_KEY"]) - -# The `PathwayVectorServer` is a wrapper over `pathway.xpacks.llm.vector_store` to accept LangChain transformers. -# Fell free to fork it to develop bespoke document processing pipelines. -vector_server = VectorStoreServer( - *data_sources, - embedder=embedder, - splitter=text_splitter, -) -# _MD_SHOW_vector_server.run_server(host="127.0.0.1", port=PATHWAY_PORT, threaded=True, with_cache=False) -# _MD_SHOW_time.sleep(30) # Workaround for Colab - messages from threads are not visible unless a cell is running - -# %% [markdown] -# We now instantiate and configure the client - -# %% -client = VectorStoreClient( - host="127.0.0.1", - port=PATHWAY_PORT, -) - -# %% [markdown] -# And we can start asking queries - -# %% -query = "What is Pathway?" -# _MD_SHOW_docs = client(query) -# _MD_SHOW_docs - - -# %% [markdown] -# **Your turn!** Now make a change to the source documents or make a fresh one and retry the query! - -# %% [markdown] -# ## Integrations -# -# ### Langchain -# -# You can use a Pathway Vector Store in LangChain pipelines with `PathwayVectorClient` -# and configure a `VectorStoreServer` using LangChain components. For more information see [our article](/blog/langchain-integration) or [LangChain documentation](https://python.langchain.com/v0.1/docs/integrations/vectorstores/pathway/). 
-# - -# %% -# _MD_SHOW_!pip install langchain -# _MD_SHOW_!pip install langchain-openai -# _MD_SHOW_!pip install langchain-community - -# %% [markdown] -# ```python -# from langchain_community.vectorstores import PathwayVectorClient -# -# # PathwayVectorClient implements regular VectorStore API of LangChain -# client = PathwayVectorClient(host="127.0.0.1", port=PATHWAY_PORT) -# docs = client.similarity_search("What is Pathway?") -# ``` - -# %% -# Here we show how to configure a server that uses LangChain document processing components - -# _MD_SHOW_from langchain_openai import OpenAIEmbeddings -# _MD_SHOW_from langchain.text_splitter import CharacterTextSplitter - -# Choose proper LangChain document transformers -# _MD_SHOW_text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) -# _MD_SHOW_embeddings_model = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"]) - -# Use VectorStoreServer.from_langchain_components to create a vector server using LangChain -# document processors -# _MD_SHOW_vector_server = VectorStoreServer.from_langchain_components( -# _MD_SHOW_ *data_sources, -# _MD_SHOW_ embedder=embeddings_model, -# _MD_SHOW_ splitter=text_splitter, -# _MD_SHOW_) -# _MD_SHOW_vector_server.run_server(host="127.0.0.1", port=PATHWAY_PORT+1, threaded=True, with_cache=False) -# _MD_SHOW_time.sleep(30) # colab workaround - -# %% -# You can connect to the Pathway+LangChain server using any client - Pathway's, Langchain's or LlamaIndex's! -# _MD_SHOW_client = VectorStoreClient( -# _MD_SHOW_ host="127.0.0.1", -# _MD_SHOW_ port=PATHWAY_PORT+1, -# _MD_SHOW_) - -# _MD_SHOW_client.query("pathway") - -# %% [markdown] -# ### LlamaIndex -# -# Pathway is fully integrated with LlamaIndex! We show below how to instantiate a Llama-Index -# retriever that queries the Pathway VectorStoreServer -# and how to configure a server using LlamaIndex components. -# -# For more information see `Pathway Retriever` -# [cookbook](https://docs.llamaindex.ai/en/stable/examples/retrievers/pathway_retriever.html). 
-# %% -# _MD_SHOW_!pip install llama-index llama-index-retrievers-pathway llama-index-embeddings-openai - -# %% -# You can connect to the PathwayVectorStore using a llama-index compatible retriever -# _MD_SHOW_from llama_index.retrievers.pathway import PathwayRetriever - -# PathwayRetriever implements the Retriever interface -# _MD_SHOW_pr = PathwayRetriever(host="127.0.0.1", port=PATHWAY_PORT) -# _MD_SHOW_pr.retrieve(str_or_query_bundle="What is Pathway?") - -# %% -# Here we show how to configure a server that uses LlamaIndex document processing components - -# _MD_SHOW_from llama_index.embeddings.openai import OpenAIEmbedding -# _MD_SHOW_from llama_index.core.node_parser import TokenTextSplitter - -# Choose proper LlamaIndex document transformers -# _MD_SHOW_embed_model = OpenAIEmbedding(embed_batch_size=10) - -# _MD_SHOW_transformations_example = [ -# _MD_SHOW_ TokenTextSplitter( -# _MD_SHOW_ chunk_size=150, -# _MD_SHOW_ chunk_overlap=10, -# _MD_SHOW_ separator=" ", -# _MD_SHOW_ ), -# _MD_SHOW_ embed_model, -# _MD_SHOW_] - -# Use VectorStoreServer.from_llamaindex_components to create a vector server using LlamaIndex -# document processors -# _MD_SHOW_vector_server = VectorStoreServer.from_llamaindex_components( -# _MD_SHOW_ *data_sources, -# _MD_SHOW_ transformations=transformations_example, -# _MD_SHOW_) -# _MD_SHOW_vector_server.run_server(host="127.0.0.1", port=PATHWAY_PORT+2, threaded=True, with_cache=False) -# _MD_SHOW_time.sleep(30) # colab workaround - -# %% -# You can connect to the Pathway+LlamaIndex server using any client - Pathway's, Langchain's or LlamaIndex's! -# _MD_SHOW_client = VectorStoreClient( -# _MD_SHOW_ host="127.0.0.1", -# _MD_SHOW_ port=PATHWAY_PORT+2, -# _MD_SHOW_) - -# _MD_SHOW_client.query("pathway") - -# %% [markdown] -# ## Advanced topics -# -# ### Getting information on indexed files - -# %% [markdown] -# [`PathwayVectorClient.get_vectorstore_statistics()`](/developers/api-docs/pathway-xpacks-llm/vectorstore#pathway.xpacks.llm.vector_store.VectorStoreClient.get_vectorstore_statistics) gives essential statistics on the state of the vector store, like the number of indexed files and the timestamp of the last updated one. You can use it in your chains to tell the user how fresh your knowledge base is. - -# %% -# _MD_SHOW_client.get_vectorstore_statistics() - -# %% [markdown] -# You can also use [`PathwayVectorClient.get_input_files()`](/developers/api-docs/pathway-xpacks-llm/vectorstore#pathway.xpacks.llm.vector_store.VectorStoreClient.get_input_files) to get the list of indexed files along with the associated metadata. 
- -# %% -# _MD_SHOW_client.get_input_files() - -# %% [markdown] -# ### Filtering based on file metadata -# -# We support document filtering using [jmespath](https://jmespath.org/) expressions, for instance: - -# %% -# take into account only sources modified later than unix timestamp -# _MD_SHOW_docs = client(query, metadata_filter="modified_at >= `1702672093`") - -# take into account only sources modified later than unix timestamp -# _MD_SHOW_docs = client(query, metadata_filter="owner == `james`") - -# take into account only sources with path containing 'repo_readme' -# _MD_SHOW_docs = client(query, metadata_filter="contains(path, 'repo_readme')") - -# and of two conditions -# _MD_SHOW_docs = client(query, metadata_filter="owner == `james` && modified_at >= `1702672093`") - -# or of two conditions -# _MD_SHOW_docs = client(query, metadata_filter="owner == `james` || modified_at >= `1702672093`") - -# %% [markdown] -# ### Configuring the parser -# -# The vectorization pipeline supports pluggable parsers. If not provided, defaults to `UTF-8` parser. You can find available parsers [here](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/parsers.py). -# An example parser that can read PDFs, Word documents and other formats is provided with `parsers.ParseUnstructured`: - -# %% [markdown] -# ```python -# from pathway.xpacks.llm import parsers -# -# vector_server = VectorStoreServer( -# *data_sources, -# parser=parsers.ParseUnstructured(), -# embedder=embeddings_model, -# splitter=text_splitter, -# ) -# ``` - -# %% [markdown] -# ### Configuring the cache -# -# The Pathway vectorizing pipeline comes with an embeddings cache: -# ```python -# vector_server.run_server(..., with_cache=True) -# ``` -# -# The default cache configuration is the locally hosted disk cache, stored in the `./Cache` directory. However, it can be customized by explicitly specifying the caching backend chosen among several persistent backend [options](/developers/api-docs/persistence-api#pathway.persistence.Backend). - - -# %% [markdown] -# ### Running in production -# -# A production deployment will typically run the server in a separate process. We recommend running the Pathway data indexing pipeline in a container-based deployment environment like Docker or Kubernetes. For more info, see [Pathway's deployment guide](/developers/user-guide/deployment/docker-deployment/). -# -# ::shoutout-banner -# --- -# href: "https://discord.gg/pathway" -# icon: "ic:baseline-discord" -# --- -# #title -# Discuss tricks & tips for RAG -# #description -# Join our Discord community and dive into discussions on tricks and tips for mastering Retrieval Augmented Generation -# :: diff --git a/docs/2.developers/4.user-guide/50.llm-xpack/.vectorstore_pipeline/sample_documents/repo_readme.md b/docs/2.developers/4.user-guide/50.llm-xpack/.vectorstore_pipeline/sample_documents/repo_readme.md deleted file mode 100644 index c73ce844..00000000 --- a/docs/2.developers/4.user-guide/50.llm-xpack/.vectorstore_pipeline/sample_documents/repo_readme.md +++ /dev/null @@ -1,5 +0,0 @@ -Pathway is an open framework for high-throughput and low-latency real-time data processing. It is used to create Python code which seamlessly combines batch processing, streaming, and real-time API's for LLM apps. Pathway's distributed runtime (šŸ¦€-šŸ) provides fresh results of your data pipelines whenever new inputs and requests are received. 
-
-
-In the first place, Pathway was designed to be a life-saver (or at least a time-saver) for Python developers and ML/AI engineers faced with live data sources, where you need to react quickly to fresh data. Still, Pathway is a powerful tool that can be used for a lot of things. If you want to do streaming in Python, build an AI data pipeline, or if you are looking for your next Python data processing framework, keep reading.
-
diff --git a/docs/2.developers/4.user-guide/50.llm-xpack/10.overview.md b/docs/2.developers/4.user-guide/50.llm-xpack/10.overview.md
index fbc19adc..f40e5352 100644
--- a/docs/2.developers/4.user-guide/50.llm-xpack/10.overview.md
+++ b/docs/2.developers/4.user-guide/50.llm-xpack/10.overview.md
@@ -193,16 +193,15 @@ texts = documents.select(chunk=splitter(pw.this.text))

 `TokenCountSplitter` returns data in the same format as `ParseUnstructured` - that is for each row it returns a list of tuples, where each tuple consists of a string with the text of a chunk and a dictionary with associated metadata.

-With these tools it is easy to create in Pathway a pipeline serving as a Vector Store, but which updates on each data change. You can check such an example in [the llm-app repository](https://github.com/pathwaycom/llm-app/blob/main/examples/pipelines/demo-question-answering/app.py). As it is a common pipeline, Pathway provides a [class `VectorStore`](/developers/api-docs/pathway-xpacks-llm/vectorstore#pathway.xpacks.llm.vector_store.VectorStoreServer) which implements this pipeline.

-## Ready-to-use Vector Store
+## Ready-to-use Document Store

-Pathway Vector Store enables building a document index on top of your documents and allows for easy-to-manage, always up-to-date LLM pipelines accessible using a RESTful API. It maintains an index of your documents and allows for querying for documents closest to a given query. It is implemented using two classes - [`VectorStoreServer`](/developers/api-docs/pathway-xpacks-llm/vectorstore#pathway.xpacks.llm.vector_store.VectorStoreServer) and [`VectorStoreClient`](/developers/api-docs/pathway-xpacks-llm/vectorstore#pathway.xpacks.llm.vector_store.VectorStoreClient).
+With these tools it is easy to create a pipeline in Pathway that serves as a [`DocumentStore`](/developers/api-docs/pathway-xpacks-llm/document_store), automatically indexing your documents and updating the index whenever the data changes.

-The `VectorStoreServer` class implements the pipeline for indexing your documents and runs an HTTP REST server for nearest neighbors queries. You can use `VectorStoreServer` by itself to use Pathway as a Vector Store, and you then query it using REST. Alternatively, use `VectorStoreClient` for querying `VectorStoreServer` which implements wrappers for REST calls.
+To make interacting with the `DocumentStore` easier, you can also use the [`DocumentStoreServer`](/developers/api-docs/pathway-xpacks-llm/servers#pathway.xpacks.llm.servers.DocumentStoreServer), which handles the API calls.

-You can learn more about Vector Store in Pathway in a [dedicated tutorial](/developers/user-guide/llm-xpack/vectorstore_pipeline).
+You can learn more about the Document Store in Pathway in a [dedicated tutorial](/developers/user-guide/llm-xpack/docs-indexing) and check out a QA app example in [the llm-app repository](https://github.com/pathwaycom/llm-app/blob/main/examples/pipelines/demo-question-answering/app.py).
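+A minimal sketch of how the two classes fit together; `docs`, `retriever_factory`, and `splitter` are placeholders for the objects built in the tutorial linked above:
+
+```python
+from pathway.xpacks.llm.document_store import DocumentStore
+from pathway.xpacks.llm.servers import DocumentStoreServer
+
+# `docs` is an input table of documents, `retriever_factory` an index factory,
+# and `splitter` a chunker - all constructed as shown in the indexing tutorial.
+store = DocumentStore(
+    docs=docs,
+    retriever_factory=retriever_factory,
+    splitter=splitter,
+)
+
+# Expose the store over a REST API.
+server = DocumentStoreServer(host="127.0.0.1", port=8765, document_store=store)
+server.run(threaded=True, with_cache=False)
+```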
### Integrating with LlamaIndex and LangChain

diff --git a/docs/2.developers/4.user-guide/50.llm-xpack/30.docs-indexing.md b/docs/2.developers/4.user-guide/50.llm-xpack/30.docs-indexing.md
new file mode 100644
index 00000000..f388bdf6
--- /dev/null
+++ b/docs/2.developers/4.user-guide/50.llm-xpack/30.docs-indexing.md
@@ -0,0 +1,218 @@
---
title: "Document Indexing"
description: "Introduction to the Pathway LLM xpack"
date: "2025-01-13"
thumbnail: ""
tags: ["tutorial", "indexing", "document storage", "retrieval"]
keywords: ["LLM", "GPT", "OpenAI", "Gemini"]
---

# Document Indexing

Document indexing organizes and categorizes documents to enable efficient search and retrieval. By creating an **index**, a structured representation of the document's content, you can quickly access information based on search queries. In the context of large language models (LLMs) like GPT, indexing enhances their ability to generate relevant responses by organizing a knowledge repository.

## Connecting to Documents

Use the Pathway [**connector**](/developers/user-guide/connect/connectors-in-pathway) to connect to your documents:

```python
import pathway as pw

data_sources = pw.io.fs.read(
    "./sample_docs",
    format="binary",
    with_metadata=True,
)
```

We use the binary format because parsers require bytes as input, and the document store will use the metadata for filtering.

## Chunking

Large documents, like books, can be inefficient to process in a single prompt due to their size. Instead, break the document into smaller, more manageable chunks that are easier to search and require fewer tokens in API calls.

A simple approach might involve slicing the text every n characters. However, this can split sentences or phrases awkwardly, resulting in incomplete or distorted chunks. Additionally, token counts vary (a token might be a character, word, or punctuation), making it hard to manage consistent chunk sizes with character-based splitting.

A better method is to chunk the text by tokens, ensuring each chunk makes sense and aligns with sentence or paragraph boundaries. Token-based chunking is typically done at logical breakpoints, such as periods, commas, or newlines.

Pathway offers a `TokenCountSplitter` for token-based chunking. Here's how to use it:

```python
from pathway.xpacks.llm.splitters import TokenCountSplitter

text_splitter = TokenCountSplitter(
    min_tokens=100,
    max_tokens=500,
    encoding_name="cl100k_base"
)
```

This configuration creates chunks of 100–500 tokens using the `cl100k_base` tokenizer, compatible with OpenAI's `text-embedding-ada-002` model.

For more on token encodings, refer to [OpenAI's tiktoken guide](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken#encodings).
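If you want to see the splitter in action before wiring it into a document store, you can apply it directly to a table column, as in the LLM xpack overview. A small sketch, assuming a table `documents` with a string column `text` (for instance, the output of a parser):

```python
# Each row of `chunk` is a list of (chunk_text, metadata) tuples.
texts = documents.select(chunk=text_splitter(pw.this.text))
```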
## Embedding

Embedding transforms text into fixed-size vectors for indexing and retrieval.
It is required only when using vector indices, such as approximate nearest neighbor (ANN) search.
Traditional indices like BM25 (e.g., [TantivyBM25](/developers/api-docs/indexing#pathway.stdlib.indexing.TantivyBM25)) do not require embedding.
Pathway provides several embedding models, including:

- [`OpenAIEmbedder`](/developers/api-docs/pathway#pathway.xpacks.llm.embedders.OpenAIEmbedder)
- [`LiteLLMEmbedder`](/developers/api-docs/pathway#pathway.xpacks.llm.embedders.LiteLLMEmbedder)
- [`GeminiEmbedder`](/developers/api-docs/pathway#pathway.xpacks.llm.embedders.GeminiEmbedder)
- [`SentenceTransformerEmbedder`](/developers/api-docs/pathway#pathway.xpacks.llm.embedders.SentenceTransformerEmbedder)

Example:

```python
import os

from pathway.xpacks.llm.embedders import OpenAIEmbedder

embedder = OpenAIEmbedder(api_key=os.environ["OPENAI_API_KEY"])
```

The default model for `OpenAIEmbedder` is `text-embedding-ada-002`.

## Retriever

The retriever creates an index used to find the documents most relevant to a query. You can use the `BruteForceKnnFactory` to set up the retriever:

```python
from pathway.stdlib.indexing.nearest_neighbors import BruteForceKnnFactory

retriever_factory = BruteForceKnnFactory(
    embedder=embedder,
)
```

Pathway comes with several indexing engine factories:
- [BruteForceKnnFactory](/developers/api-docs/indexing#pathway.stdlib.indexing.BruteForceKnnFactory)
- [UsearchKnnFactory](/developers/api-docs/indexing#pathway.stdlib.indexing.UsearchKnnFactory)
- [TantivyBM25Factory](/developers/api-docs/indexing#pathway.stdlib.indexing.TantivyBM25Factory)
- [HybridIndexFactory](/developers/api-docs/indexing#pathway.stdlib.indexing.HybridIndexFactory)

## Assembling the Document Store

With all components ready, construct the [`DocumentStore`](/developers/api-docs/pathway#pathway.xpacks.llm.document_store.DocumentStore).
This object handles the processing of documents (**parsing**, **post-processing**, and **splitting**) and then builds an **index** (the retriever) out of them.

```python
from pathway.xpacks.llm.document_store import DocumentStore

store = DocumentStore(
    docs=data_sources,
    retriever_factory=retriever_factory,
    splitter=text_splitter,
)
```

## Preparing Queries

Save queries in a CSV file with the following columns:

1. `query`: Your question
2. `k`: Number of documents to retrieve
3. `metadata_filter` (optional): Filter files by metadata
4. `filepath_globpattern` (optional): Narrow down files by a glob path pattern

Example:

```bash
printf "query,k,metadata_filter,filepath_globpattern\n\"Who is Regina Phalange?\",3,,\n" > queries.csv
```

Let's connect to the CSV:

```python
query = pw.io.fs.read(
    "queries.csv",
    format="csv",
    # predefined schema for the query table
    schema=DocumentStore.RetrieveQuerySchema
)
```

## Retrieval

Now you can simply run the `retrieve_query` function on your store object to see which document chunks might contain useful information for answering your query.

```python
result = store.retrieve_query(query)
```
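For a quick look at the retrieved chunks during development, you can materialize and print the result table. This is a debugging sketch: since the connectors above run in streaming mode by default, the computation keeps running, so for a one-off inspection you may want to read the inputs with `mode="static"`.

```python
# Executes the dataflow and prints the resulting table.
# In a deployed pipeline you would write `result` to an output connector instead.
pw.debug.compute_and_print(result)
```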
## Advanced topics

### Interacting with Document Store via REST Server

Pathway's REST server allows you to expose a `DocumentStore` as a service that can be accessed via API requests. This is useful when integrating the `DocumentStore` into a larger system, especially if it needs to be accessed from an external process.

```python
from pathway.xpacks.llm.servers import DocumentStoreServer

PATHWAY_PORT = 8765
server = DocumentStoreServer(
    host="127.0.0.1",
    port=PATHWAY_PORT,
    document_store=store,
)
server.run(threaded=True, with_cache=False)
```

Once the server is running, you can send requests to the API:

```bash
curl -X POST http://localhost:8765/v1/retrieve \
    -H "Content-Type: application/json" \
    -d '{
        "query": "Who is Regina Phalange?",
        "k": 2
    }'
```

### Filtering files

`DocumentStore` allows you to narrow down the search for relevant documents based on file metadata or file paths.
Two query fields enable this functionality (both were also mentioned in the [Preparing Queries](/developers/user-guide/llm-xpack/docs-indexing#preparing-queries) section above):
- `metadata_filter` (optional): filter files with a [JMESPath](https://jmespath.org/) expression over metadata fields such as `modified_at` and `owner`, or functions like `contains`
- `filepath_globpattern` (optional): narrow down files by a glob path pattern

Example:

```bash
printf 'query,k,metadata_filter,filepath_globpattern\n"Who is Regina Phalange?",3,owner==`albert`,**/phoebe*\n' > queries.csv
```

| query | k | metadata_filter | filepath_globpattern |
|---------------------------|---|-------------------|----------------------|
| "Who is Regina Phalange?" | 3 | owner==\`albert\` | \*\*/phoebe* |

```python
query = pw.io.fs.read(
    "queries.csv",
    format="csv",
    schema=DocumentStore.RetrieveQuerySchema
)

result = store.retrieve_query(query)
```

The available metadata fields depend on the type of connector you are using. You can find the extracted metadata fields in the API documentation of the connector's `read` function, specifically the `with_metadata` parameter.
For example, in the [CSV connector](https://pathway.com/developers/api-docs/pathway-io/csv#pathway.io.csv.read), setting `with_metadata=True` gives you access to the `created_at`, `modified_at`, `owner`, `size`, `path`, and `seen_at` metadata fields, which you can use for filtering.

#### Finding documents

You can also use the `inputs_query` function to search your documents based only on a glob pattern and metadata, without involving retrieval. Provide a Pathway table with just two columns (`metadata_filter` and `filepath_globpattern`), following `DocumentStore.InputsQuerySchema`.
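A minimal sketch, assuming a hypothetical CSV file `file_queries.csv` that contains only these two columns:

```python
file_queries = pw.io.fs.read(
    "file_queries.csv",  # hypothetical file with columns: metadata_filter, filepath_globpattern
    format="csv",
    schema=DocumentStore.InputsQuerySchema,
)

# Returns the matching input documents with their metadata, without running retrieval.
matching_docs = store.inputs_query(file_queries)
```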
diff --git a/docs/2.developers/4.user-guide/50.llm-xpack/30.vectorstore_pipeline.md b/docs/2.developers/4.user-guide/50.llm-xpack/30.vectorstore_pipeline.md
deleted file mode 120000
index 7aa379aa..00000000
--- a/docs/2.developers/4.user-guide/50.llm-xpack/30.vectorstore_pipeline.md
+++ /dev/null
@@ -1 +0,0 @@
-.vectorstore_pipeline/article.md
\ No newline at end of file
diff --git a/docs/2.developers/7.templates/.adaptive-rag/article.py b/docs/2.developers/7.templates/.adaptive-rag/article.py
index cc17826e..2bd19536 100644
--- a/docs/2.developers/7.templates/.adaptive-rag/article.py
+++ b/docs/2.developers/7.templates/.adaptive-rag/article.py
@@ -171,7 +171,7 @@

 # ## Trying it out
 #
-# We provide an implementation of the Adaptive RAG in the [Pathway](https://pathway.com/developers) data processing framework. Pathway is your one-stop-shop for building realtime data processing pipelines, from simple ETL to synchronizing and indexing document collections into knowledge bases. The Pathway [LLM Xpack](/developers/user-guide/llm-xpack/overview/) is a set of pipeline components that are useful in working with LLMs: [auto-updating vector stores](/developers/user-guide/llm-xpack/vectorstore_pipeline/), [RAGs](/solutions/rag-pipelines) and [many more LLM examples](https://github.com/pathwaycom/llm-app).
+# We provide an implementation of the Adaptive RAG in the [Pathway](https://pathway.com/developers) data processing framework. Pathway is your one-stop-shop for building realtime data processing pipelines, from simple ETL to synchronizing and indexing document collections into knowledge bases. The Pathway [LLM Xpack](/developers/user-guide/llm-xpack/overview/) is a set of pipeline components that are useful in working with LLMs: [auto-updating vector stores](/developers/user-guide/llm-xpack/docs-indexing/), [RAGs](/solutions/rag-pipelines) and [many more LLM examples](https://github.com/pathwaycom/llm-app).
 # If you are interested in how Adaptive RAG is implemented inside Pathway, you can [dive into the internals directly here](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/question_answering.py#L37).

 # As a prerequisite to run the code, install necessary packages and download sample data which will be used.
diff --git a/python/pathway/io/csv/__init__.py b/python/pathway/io/csv/__init__.py
index 3ecb880a..7a7a2ae5 100644
--- a/python/pathway/io/csv/__init__.py
+++ b/python/pathway/io/csv/__init__.py
@@ -60,13 +60,12 @@ def read(
     object_pattern: Unix shell style pattern for filtering only certain files in the \
 directory. Ignored in case a path to a single file is specified. This value will be \
 deprecated soon, please use glob pattern in ``path`` instead.
-    with_metadata: When set to true, the connector will add an additional column \
-named ``_metadata`` to the table. This column will be a JSON field that will contain two \
-optional fields - ``created_at`` and ``modified_at``. These fields will have integral \
-UNIX timestamps for the creation and modification time respectively. Additionally, the \
-column will also have an optional field named ``owner`` that will contain the name of \
-the file owner (applicable only for Un). Finally, the column will also contain a field \
-named ``path`` that will show the full path to the file from where a row was filled.
+    with_metadata: When set to true, the connector will add an additional column
+named ``_metadata`` to the table. This JSON field may contain: (1) created_at - UNIX
+timestamp of file creation; (2) modified_at - UNIX timestamp of last modification;
+(3) seen_at - UNIX timestamp of when the file was found by the engine;
+(4) owner - name of the file owner (only for Unix); (5) path - full file path of the
+source row; (6) size - file size in bytes.
     types: Dictionary containing the mapping between the columns and the data
 types (``pw.Type``) of the values of those columns. This parameter is optional, and if not
 provided the default type is ``pw.Type.ANY``. [will be deprecated soon]
diff --git a/python/pathway/io/fs/__init__.py b/python/pathway/io/fs/__init__.py
index 79ba66d3..e9fd0507 100644
--- a/python/pathway/io/fs/__init__.py
+++ b/python/pathway/io/fs/__init__.py
@@ -84,13 +84,12 @@ def read(
     object_pattern: Unix shell style pattern for filtering only certain files in the \
 directory. Ignored in case a path to a single file is specified. This value will be \
 deprecated soon, please use glob pattern in ``path`` instead.
-    with_metadata: When set to true, the connector will add an additional column \
-named ``_metadata`` to the table. This column will be a JSON field that will contain two \
-optional fields - ``created_at`` and ``modified_at``. These fields will have integral \
-UNIX timestamps for the creation and modification time respectively. Additionally, the \
-column will also have an optional field named ``owner`` that will contain the name of \
-the file owner (applicable only for Un). Finally, the column will also contain a field \
-named ``path`` that will show the full path to the file from where a row was filled.
+    with_metadata: When set to true, the connector will add an additional column
+named ``_metadata`` to the table. This JSON field may contain: (1) created_at - UNIX
+timestamp of file creation; (2) modified_at - UNIX timestamp of last modification;
+(3) seen_at - UNIX timestamp of when the file was found by the engine;
+(4) owner - name of the file owner (only for Unix); (5) path - full file path of the
+source row; (6) size - file size in bytes.
     persistent_id: (unstable) An identifier, under which the state of the table
 will be persisted or ``None``, if there is no need to persist the state of this table.
 When a program restarts, it restores the state for all input tables according to what