From b7f534aea2ea2bcd6670843e4837712e5df10595 Mon Sep 17 00:00:00 2001
From: "olivia.hess" <olivia.hess@pnnl.gov>
Date: Tue, 4 Feb 2025 16:58:37 -0500
Subject: [PATCH] adding extra param for regex in find record by attribute

---
 nmdc_notebook_tools/collection_search.py   | 10 ++++++++-
 nmdc_notebook_tools/data_processing.py     | 26 +++++++++++++++++-----
 nmdc_notebook_tools/test/test_biosample.py | 20 ++++++++++-------
 pyproject.toml                             |  2 +-
 4 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/nmdc_notebook_tools/collection_search.py b/nmdc_notebook_tools/collection_search.py
index d08f13f..61f6401 100644
--- a/nmdc_notebook_tools/collection_search.py
+++ b/nmdc_notebook_tools/collection_search.py
@@ -112,6 +112,7 @@ def get_record_by_attribute(
         max_page_size=25,
         fields="",
         all_pages=False,
+        exact_match=False,
     ):
         """
         Get a record from the NMDC API by its name. Records can be filtered based on their attributes found https://microbiomedata.github.io/nmdc-schema/.
@@ -125,8 +126,15 @@ def get_record_by_attribute(
             fields: str
                 The fields to return. Default is all fields.
             all_pages: bool
+                True to return all pages. False to return the first page. Default is False.
+            exact_match: bool
+                Whether attribute_value must match the stored value exactly. Default is False, meaning a partial match is enough.
+                Under the hood, False wraps the value in a $regex expression, while True compares it by plain equality.
         """
-        filter = f'{{"{attribute_name}":{{"$regex":"{attribute_value}"}}}}'
+        if exact_match:
+            filter = f'{{"{attribute_name}":"{attribute_value}"}}'
+        else:
+            filter = f'{{"{attribute_name}":{{"$regex":"{attribute_value}"}}}}'
         results = self.get_records(filter, max_page_size, fields, all_pages)
         return results
 
diff --git a/nmdc_notebook_tools/data_processing.py b/nmdc_notebook_tools/data_processing.py
index bf50046..1e609a8 100644
--- a/nmdc_notebook_tools/data_processing.py
+++ b/nmdc_notebook_tools/data_processing.py
@@ -9,6 +9,13 @@ class DataProcessing:
     def __init__(self):
         pass
 
+    def _string_mongo_list(self, data: list) -> str:
+        """
+        Serialize data (a list or a filter dict) to a JSON string; JSON always uses the double quotes mongo queries require.
+        """
+        import json  # local import; unlike str(data).replace("'", '"') this keeps apostrophes in values and emits true/false/null
+        return json.dumps(data)
+
     def convert_to_df(self, data: list) -> pd.DataFrame:
         """
         Convert a list of dictionaries to a pandas dataframe.
@@ -82,17 +89,24 @@ def identify_and_explode(df):
         merged_df.drop_duplicates(keep="first", inplace=True)
         return merged_df
 
-    def build_filter(self, attributes):
+    def build_filter(self, attributes, exact_match=False):
         """
-        Create a MongoDB filter using $regex for each attribute in the input dictionary. For nested attributes, use dot notation.
+        Create a MongoDB filter string from the input dictionary, matching each attribute by $regex (default) or by equality.
+        For nested attributes, use dot notation.
 
         Parameters:
-        attributes (dict): Dictionary of attribute names and their corresponding values to match using regex.
-            Example: {"name": "example", "description": "example", "geo_loc_name": "example"}
+            attributes (dict): Dictionary of attribute names and their corresponding values to match.
+                Example: {"name": "example", "description": "example", "geo_loc_name": "example"}
+            exact_match (bool): Whether each value must match exactly. Default is False, meaning a partial match is enough.
+                When False, each value is wrapped in a $regex expression; when True, it is used as-is (equality).
         Returns:
-        dict: A MongoDB filter dictionary.
+            str: The filter dictionary serialized to a string by _string_mongo_list.
+                Example (exact_match=False): {"name": {"$regex": "example"}}
         """
-        filter_dict = {}
-        for attribute_name, attribute_value in attributes.items():
-            filter_dict[attribute_name] = {"$regex": attribute_value}
-        return self.string_mongo_list(filter_dict)
+        if exact_match:
+            # Equality: each attribute maps straight to its value.
+            filter_dict = dict(attributes)
+        else:
+            # Partial match: each attribute maps to a {"$regex": value} condition.
+            filter_dict = {name: {"$regex": value} for name, value in attributes.items()}
+        return self._string_mongo_list(filter_dict)
diff --git a/nmdc_notebook_tools/test/test_biosample.py b/nmdc_notebook_tools/test/test_biosample.py
index 792e827..3a8df3d 100644
--- a/nmdc_notebook_tools/test/test_biosample.py
+++ b/nmdc_notebook_tools/test/test_biosample.py
@@ -2,6 +2,7 @@
 from nmdc_notebook_tools.biosample_search import BiosampleSearch
 import logging
 from nmdc_notebook_tools.utils import Utils
+from nmdc_notebook_tools.data_processing import DataProcessing
 
 
 def test_find_biosample_by_id():
@@ -25,8 +26,11 @@ def test_biosample_by_filter():
 
 def test_biosample_by_attribute():
-    biosample = BiosampleSearch()
-    results = biosample.get_record_by_attribute("id", "nmdc:bsm-11-006pnx90")
-    assert len(results) > 0
+    """An exact-match lookup by id should return exactly one biosample record."""
+    records = BiosampleSearch().get_record_by_attribute(
+        "id", "nmdc:bsm-11-006pnx90", exact_match=True
+    )
+    print(records)
+    assert len(records) == 1
 
 
 def test_biosample_by_latitude():
@@ -55,18 +59,18 @@ def test_biosample_by_lat_long():
 
 
 def test_biosample_build_filter_1():
-    u = Utils()
-    b = BiosampleSearch()
-    filter = u.build_filter({"name": "G6R2_NF_20JUN2016"})
-    results = b.biosample_by_filter(filter)
-    print(results)
-    assert len(results) > 0
+    """A name regex filter from build_filter should match exactly one biosample."""
+    search = BiosampleSearch()
+    name_filter = DataProcessing().build_filter({"name": "G6R2_NF_20JUN2016"})
+    records = search.get_record_by_filter(name_filter)
+    print(records)
+    assert len(records) == 1
 
 
 def test_biosample_build_filter_2():
-    u = Utils()
-    b = BiosampleSearch()
-    filter = u.build_filter({"name": "G6R2_NF_20JUN2016", "id": "nmdc:bsm-11-006pnx90"})
-    results = b.biosample_by_filter(filter)
-    print(results)
-    assert len(results) < 0
+    """A filter combining name and id should match exactly one biosample."""
+    attributes = {"name": "G6R2_NF_20JUN2016", "id": "nmdc:bsm-11-006pnx90"}
+    combined_filter = DataProcessing().build_filter(attributes)
+    records = BiosampleSearch().get_record_by_filter(combined_filter)
+    print(records)
+    assert len(records) == 1
diff --git a/pyproject.toml b/pyproject.toml
index 7a35e88..8a31f14 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "nmdc_notebook_tools"
-version = "0.2.2"
+version = "0.2.3"
 description = "A Python library for general research functions using NMDC APIs"
 authors = [
     { name = "Olivia Hess", email = "olivia.hess@pnnl.gov" },