apache · damccorm · Jan 14, 2025 · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024
diff --git a/CHANGES.md b/CHANGES.md
@@ -67,6 +67,7 @@
 ## New Features / Improvements
 
 * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)).
+* Add BigQuery vector/embedding ingestion and enrichment components to `apache_beam.ml.rag` (Python).
 * Upgraded to protobuf 4 (Java) ([#33192](https://github.com/apache/beam/issues/33192)).
 
 ## Breaking Changes
@@ -107,6 +108,7 @@
 * Support OnWindowExpiration in Prism ([#32211](https://github.com/apache/beam/issues/32211)).
   * This enables initial Java GroupIntoBatches support.
 * Support OrderedListState in Prism ([#32929](https://github.com/apache/beam/issues/32929)).
+* Add the `apache_beam.ml.rag` package with RAG types, base chunking, LangChain chunking and HuggingFace embedding components (Python).
 
 ## Breaking Changes
 

diff --git a/sdks/python/apache_beam/ml/rag/enrichment/bigquery_vector_search.py b/sdks/python/apache_beam/ml/rag/enrichment/bigquery_vector_search.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 #
 
+from collections import defaultdict
 from dataclasses import dataclass
 from typing import Any
 from typing import Dict
@@ -187,12 +188,10 @@ def format_query(self, chunks: List[Chunk]) -> str:
     """ if self.metadata_restriction_template else ""
 
     # Group chunks by their metadata conditions
-    condition_groups = {}
+    condition_groups = defaultdict(list)
     if self.metadata_restriction_template:
       for chunk in chunks:
         condition = self.metadata_restriction_template.format(**chunk.metadata)
-        if condition not in condition_groups:
-          condition_groups[condition] = []
         condition_groups[condition].append(chunk)
     else:
       # No metadata filtering - all chunks in one group

diff --git a/sdks/python/apache_beam/ml/rag/enrichment/bigquery_vector_search_it_test.py b/sdks/python/apache_beam/ml/rag/enrichment/bigquery_vector_search_it_test.py
@@ -23,18 +23,21 @@
 import apache_beam as beam
 from apache_beam.io.gcp.bigquery_tools import BigQueryWrapper
 from apache_beam.io.gcp.internal.clients import bigquery
-from apache_beam.ml.rag.enrichment.bigquery_vector_search import BigQueryVectorSearchEnrichmentHandler
-from apache_beam.ml.rag.enrichment.bigquery_vector_search import BigQueryVectorSearchParameters
 from apache_beam.ml.rag.types import Chunk
 from apache_beam.ml.rag.types import Content
 from apache_beam.ml.rag.types import Embedding
 from apache_beam.testing.test_pipeline import TestPipeline
 from apache_beam.testing.util import assert_that
 from apache_beam.testing.util import equal_to
-from apache_beam.transforms.enrichment import Enrichment
 
+# pylint: disable=ungrouped-imports
 try:
   from google.api_core.exceptions import BadRequest
+  from apache_beam.transforms.enrichment import Enrichment
+  from apache_beam.ml.rag.enrichment.bigquery_vector_search import \
+    BigQueryVectorSearchEnrichmentHandler
+  from apache_beam.ml.rag.enrichment.bigquery_vector_search import \
+    BigQueryVectorSearchParameters
 except ImportError:
   raise unittest.SkipTest('BigQuery dependencies not installed')
 

diff --git a/sdks/python/apache_beam/ml/rag/ingestion/bigquery.py b/sdks/python/apache_beam/ml/rag/ingestion/bigquery.py
@@ -18,7 +18,6 @@
 from dataclasses import dataclass
 from typing import Any
 from typing import Dict
-from typing import List
 from typing import Optional
 
 import apache_beam as beam
@@ -123,38 +122,62 @@ def create_write_transform(self) -> beam.PTransform:
     return _WriteToBigQueryVectorDatabase(self)
 
 
+def _default_chunk_to_dict_fn(chunk: Chunk):
+  if chunk.embedding is None or chunk.embedding.dense_embedding is None:
+    raise ValueError("chunk must contain dense embedding")
+  return {
+      'id': chunk.id,
+      'embedding': chunk.embedding.dense_embedding,
+      'content': chunk.content.text,
+      'metadata': [
+          {
+              "key": k, "value": str(v)
+          } for k, v in chunk.metadata.items()
+      ]
+  }
+
+
+def _default_schema():
+  return {
+      'fields': [{
+          'name': 'id', 'type': 'STRING'
+      }, {
+          'name': 'embedding', 'type': 'FLOAT64', 'mode': 'REPEATED'
+      }, {
+          'name': 'content', 'type': 'STRING'
+      },
+                 {
+                     'name': 'metadata',
+                     'type': 'RECORD',
+                     'mode': 'REPEATED',
+                     'fields': [{
+                         'name': 'key', 'type': 'STRING'
+                     }, {
+                         'name': 'value', 'type': 'STRING'
+                     }]
+                 }]
+  }
+
+
 class _WriteToBigQueryVectorDatabase(beam.PTransform):
   """Implementation of BigQuery vector database write. """
   def __init__(self, config: BigQueryVectorWriterConfig):
     self.config = config
 
   def expand(self, pcoll: beam.PCollection[Chunk]):
-
-    if not self.config.schema_config:
-      rows_to_write = (
-          pcoll
-          | "Convert to schema'd Rows" >> beam.Map(
-              lambda chunk: beam.Row(
-                  id=chunk.id,
-                  embedding=chunk.embedding.dense_embedding,
-                  content=chunk.content.text,
-                  metadata=chunk.metadata)).with_output_types(
-                      RowTypeConstraint.from_fields(
-                          [('id', str), ('content', str),
-                           ('embedding', List[float]),
-                           ('metadata', Dict[str, str])])))
-    else:
-      schema = self.config.schema_config.schema
-      rows_to_write = (
-          pcoll
-          | "Chunk to dict" >> beam.Map(
-              self.config.schema_config.chunk_to_dict_fn)
-          | "Chunk dict to schema'd row" >> beam.Map(
-              lambda chunk_dict: beam_row_from_dict(
-                  row=chunk_dict, schema=schema)).with_output_types(
-                      RowTypeConstraint.from_fields(
-                          get_beam_typehints_from_tableschema(schema))))
+    schema = (
+        self.config.schema_config.schema
+        if self.config.schema_config else _default_schema())
+    chunk_to_dict_fn = (
+        self.config.schema_config.chunk_to_dict_fn
+        if self.config.schema_config else _default_chunk_to_dict_fn)
     return (
-        rows_to_write
+        pcoll
+        | "Chunk to dict" >> beam.Map(chunk_to_dict_fn)
+        | "Chunk dict to schema'd row" >> beam.Map(
+            lambda chunk_dict: beam_row_from_dict(
+                row=chunk_dict, schema=schema)).with_output_types(
+                    RowTypeConstraint.from_fields(
+                        get_beam_typehints_from_tableschema(schema)))
         | "Write to BigQuery" >> beam.managed.Write(
             beam.managed.BIGQUERY, config=self.config.write_config))
diff --git a/sdks/python/apache_beam/ml/rag/ingestion/bigquery_it_test.py b/sdks/python/apache_beam/ml/rag/ingestion/bigquery_it_test.py
@@ -108,6 +108,19 @@ def test_default_schema(self):
     with beam.Pipeline(argv=args) as p:
       _ = (p | beam.Create(chunks) | config.create_write_transform())
 
+  def test_default_schema_missing_embedding(self):
+    table_name = 'python_default_schema_table'
+    table_id = '{}.{}.{}'.format(self.project, self.dataset_id, table_name)
+
+    config = BigQueryVectorWriterConfig(write_config={'table': table_id})
+    chunks = [
+        Chunk(id="1", content=Content(text="foo"), metadata={"a": "b"}),
+        Chunk(id="2", content=Content(text="bar"), metadata={"c": "d"})
+    ]
+    with self.assertRaises(ValueError):
+      with beam.Pipeline() as p:
+        _ = (p | beam.Create(chunks) | config.create_write_transform())
+
   def test_custom_schema(self):
     table_name = 'python_custom_schema_table'
     table_id = '{}.{}.{}'.format(self.project, self.dataset_id, table_name)