feat: add basic facet view (term bucket with # of items for each value)

openfoodfacts · Oct 17, 2023 · 930b335 · 930b335
1 parent 7735dac
commit 930b335
Show file tree

Hide file tree

Showing 11 changed files with 322 additions and 1 deletion.
diff --git a/app/config.py b/app/config.py
@@ -178,6 +178,14 @@ class FieldConfig(BaseModel):
             "field are provided."
         ),
     ] = False
+    bucket_agg: Annotated[
+        bool,
+        Field(
+            description="do we add a bucket aggregation to the elasticsearch query for this field. "
+            "It is used to return a 'faceted-view' with the number of results for each facet value. "
+            "Only valid for keyword, bool or numeric field types."
+        ),
+    ] = False
     taxonomy_name: Annotated[
         str | None, Field(description="only for taxonomy field type")
     ] = None
@@ -195,6 +203,18 @@ def taxonomy_name_should_be_used_for_taxonomy_type_only(self):
             raise ValueError("taxonomy_name should be provided for taxonomy type only")
         return self
 
+    @model_validator(mode="after")
+    def bucket_agg_should_be_used_for_keyword_and_numeric_types_only(self):
+        """Validator that checks that `bucket_agg` is only enabled for fields
+        whose type is `keyword`, `bool` or numeric (`self.type.is_numeric()`)."""
+        if self.bucket_agg and not (
+            self.type.is_numeric() or self.type in (FieldType.keyword, FieldType.bool)
+        ):
+            raise ValueError(
+                "bucket_agg should be provided for keyword, numeric or bool type only"
+            )
+        return self
+
     def get_input_field(self):
         """Name to read from input data: `input_field` if truthy, else `name`."""
         return self.input_field if self.input_field else self.name

diff --git a/app/postprocessing.py b/app/postprocessing.py
@@ -38,6 +38,7 @@ def process(self, response: Response, projection: set[str] | None) -> JSONType:
                 result = dict((k, v) for k, v in result.items() if k in projection)
             hits.append(result)
         output["hits"] = hits
+        output["aggregations"] = response.aggregations.to_dict()
         return output
 
     def process_after(self, result: JSONType) -> JSONType:

diff --git a/app/query.py b/app/query.py
@@ -1,5 +1,6 @@
 import elastic_transport
-from elasticsearch_dsl import Q, Search
+from elasticsearch_dsl import A, Q, Search
+from elasticsearch_dsl.aggs import Agg
 from elasticsearch_dsl.query import Query
 from luqum import visitor
 from luqum.elasticsearch import ElasticsearchQueryBuilder
@@ -192,6 +193,17 @@ def parse_sort_by_parameter(sort_by: str | None, config: Config) -> str | None:
     return sort_by
 
 
+def create_aggregation_clauses(config: Config) -> dict[str, Agg]:
+    """Build one `terms` aggregation per configured field that has
+    `bucket_agg` enabled, keyed by the field name.
+    """
+    return {
+        field_config.name: A("terms", field=field_config.name)
+        for field_config in config.fields.values()
+        if field_config.bucket_agg
+    }
+
+
 def build_search_query(
     q: str,
     langs: set[str],
@@ -226,6 +238,9 @@ def build_search_query(
     if filter_query:
         query = query.query("bool", filter=filter_query)
 
+    for agg_name, agg in create_aggregation_clauses(config).items():
+        query.aggs.bucket(agg_name, agg)
+
     sort_by = parse_sort_by_parameter(sort_by, config)
     if sort_by is not None:
         query = query.sort(sort_by)

diff --git a/app/types.py b/app/types.py
@@ -26,6 +26,7 @@ def is_success(self):
 
 class SuccessSearchResponse(BaseModel):
     hits: list[JSONType]
+    aggregations: JSONType
     page: int
     page_size: int
     page_count: int

diff --git a/data/config/openfoodfacts.yml b/data/config/openfoodfacts.yml
@@ -33,6 +33,9 @@ fields:
     full_text_search: true
     split: true
     type: text
+  brands_tags:
+    type: keyword
+    bucket_agg: true
   stores:
     split: true
     type: text
@@ -41,20 +44,26 @@ fields:
     type: text
   lang:
     type: keyword
+    bucket_agg: true
   lc:
     type: keyword
   owner:
     type: keyword
+    bucket_agg: true
   quantity:
     type: text
   categories_tags:
     type: keyword
+    bucket_agg: true
   labels_tags:
     type: keyword
+    bucket_agg: true
   countries_tags:
     type: keyword
+    bucket_agg: true
   states_tags:
     type: keyword
+    bucket_agg: true
   origins_tags:
     type: keyword
   ingredients_tags:
@@ -65,10 +74,13 @@ fields:
     type: integer
   nutrition_grades:
     type: keyword
+    bucket_agg: true
   ecoscore_grade:
     type: keyword
+    bucket_agg: true
   nova_groups:
     type: keyword
+    bucket_agg: true
   last_modified_t:
     type: date
   created_t:

diff --git a/tests/unit/data/complex_query.json b/tests/unit/data/complex_query.json
@@ -113,6 +113,58 @@
       "minimum_should_match": 1
     }
   },
+  "aggs": {
+    "brands_tags": {
+      "terms": {
+        "field": "brands_tags"
+      }
+    },
+    "lang": {
+      "terms": {
+        "field": "lang"
+      }
+    },
+    "owner": {
+      "terms": {
+        "field": "owner"
+      }
+    },
+    "categories_tags": {
+      "terms": {
+        "field": "categories_tags"
+      }
+    },
+    "labels_tags": {
+      "terms": {
+        "field": "labels_tags"
+      }
+    },
+    "countries_tags": {
+      "terms": {
+        "field": "countries_tags"
+      }
+    },
+    "states_tags": {
+      "terms": {
+        "field": "states_tags"
+      }
+    },
+    "nutrition_grades": {
+      "terms": {
+        "field": "nutrition_grades"
+      }
+    },
+    "ecoscore_grade": {
+      "terms": {
+        "field": "ecoscore_grade"
+      }
+    },
+    "nova_groups": {
+      "terms": {
+        "field": "nova_groups"
+      }
+    }
+  },
   "size": 25,
   "from": 25
 }
diff --git a/tests/unit/data/non_existing_filter_field.json b/tests/unit/data/non_existing_filter_field.json
@@ -13,6 +13,58 @@
       ]
     }
   },
+  "aggs": {
+    "brands_tags": {
+      "terms": {
+        "field": "brands_tags"
+      }
+    },
+    "lang": {
+      "terms": {
+        "field": "lang"
+      }
+    },
+    "owner": {
+      "terms": {
+        "field": "owner"
+      }
+    },
+    "categories_tags": {
+      "terms": {
+        "field": "categories_tags"
+      }
+    },
+    "labels_tags": {
+      "terms": {
+        "field": "labels_tags"
+      }
+    },
+    "countries_tags": {
+      "terms": {
+        "field": "countries_tags"
+      }
+    },
+    "states_tags": {
+      "terms": {
+        "field": "states_tags"
+      }
+    },
+    "nutrition_grades": {
+      "terms": {
+        "field": "nutrition_grades"
+      }
+    },
+    "ecoscore_grade": {
+      "terms": {
+        "field": "ecoscore_grade"
+      }
+    },
+    "nova_groups": {
+      "terms": {
+        "field": "nova_groups"
+      }
+    }
+  },
   "size": 25,
   "from": 25
 }
diff --git a/tests/unit/data/openfoodfacts_config.yml b/tests/unit/data/openfoodfacts_config.yml
@@ -33,6 +33,9 @@ fields:
     full_text_search: true
     split: true
     type: text
+  brands_tags:
+    type: keyword
+    bucket_agg: true
   stores:
     split: true
     type: text
@@ -41,20 +44,26 @@ fields:
     type: text
   lang:
     type: keyword
+    bucket_agg: true
   lc:
     type: keyword
   owner:
     type: keyword
+    bucket_agg: true
   quantity:
     type: text
   categories_tags:
     type: keyword
+    bucket_agg: true
   labels_tags:
     type: keyword
+    bucket_agg: true
   countries_tags:
     type: keyword
+    bucket_agg: true
   states_tags:
     type: keyword
+    bucket_agg: true
   origins_tags:
     type: keyword
   ingredients_tags:
@@ -65,10 +74,13 @@ fields:
     type: integer
   nutrition_grades:
     type: keyword
+    bucket_agg: true
   ecoscore_grade:
     type: keyword
+    bucket_agg: true
   nova_groups:
     type: keyword
+    bucket_agg: true
   last_modified_t:
     type: date
   created_t:

diff --git a/tests/unit/data/simple_filter_query.json b/tests/unit/data/simple_filter_query.json
@@ -12,6 +12,58 @@
       ]
     }
   },
+  "aggs": {
+    "brands_tags": {
+      "terms": {
+        "field": "brands_tags"
+      }
+    },
+    "lang": {
+      "terms": {
+        "field": "lang"
+      }
+    },
+    "owner": {
+      "terms": {
+        "field": "owner"
+      }
+    },
+    "categories_tags": {
+      "terms": {
+        "field": "categories_tags"
+      }
+    },
+    "labels_tags": {
+      "terms": {
+        "field": "labels_tags"
+      }
+    },
+    "countries_tags": {
+      "terms": {
+        "field": "countries_tags"
+      }
+    },
+    "states_tags": {
+      "terms": {
+        "field": "states_tags"
+      }
+    },
+    "nutrition_grades": {
+      "terms": {
+        "field": "nutrition_grades"
+      }
+    },
+    "ecoscore_grade": {
+      "terms": {
+        "field": "ecoscore_grade"
+      }
+    },
+    "nova_groups": {
+      "terms": {
+        "field": "nova_groups"
+      }
+    }
+  },
   "size": 25,
   "from": 25
 }