From b7f534aea2ea2bcd6670843e4837712e5df10595 Mon Sep 17 00:00:00 2001 From: "olivia.hess" Date: Tue, 4 Feb 2025 16:58:37 -0500 Subject: [PATCH] adding extra param for regex in find record by attribute --- nmdc_notebook_tools/collection_search.py | 10 ++++++++- nmdc_notebook_tools/data_processing.py | 26 +++++++++++++++++----- nmdc_notebook_tools/test/test_biosample.py | 20 ++++++++++------- pyproject.toml | 2 +- 4 files changed, 42 insertions(+), 16 deletions(-) diff --git a/nmdc_notebook_tools/collection_search.py b/nmdc_notebook_tools/collection_search.py index d08f13f..61f6401 100644 --- a/nmdc_notebook_tools/collection_search.py +++ b/nmdc_notebook_tools/collection_search.py @@ -112,6 +112,7 @@ def get_record_by_attribute( max_page_size=25, fields="", all_pages=False, + exact_match=False, ): """ Get a record from the NMDC API by its name. Records can be filtered based on their attributes found https://microbiomedata.github.io/nmdc-schema/. @@ -125,8 +126,15 @@ def get_record_by_attribute( fields: str The fields to return. Default is all fields. all_pages: bool + True to return all pages. False to return the first page. Default is False. + exact_match: bool + This var is used to determine if the inputted attribute value is an exact match or a partial match. Default is False, meaning the user does not need to input an exact match. + Under the hood this is used to determine if the inputted attribute value should be wrapped in a regex expression. """ - filter = f'{{"{attribute_name}":{{"$regex":"{attribute_value}"}}}}' + if exact_match: + filter = f'{{"{attribute_name}":"{attribute_value}"}}' + else: + filter = f'{{"{attribute_name}":{{"$regex":"{attribute_value}"}}}}' results = self.get_records(filter, max_page_size, fields, all_pages) return results diff --git a/nmdc_notebook_tools/data_processing.py b/nmdc_notebook_tools/data_processing.py index bf50046..1e609a8 100644 --- a/nmdc_notebook_tools/data_processing.py +++ b/nmdc_notebook_tools/data_processing.py @@ -9,6 +9,13 @@ class DataProcessing: def __init__(self): pass + def _string_mongo_list(self, data: list) -> str: + """ + Convert elements in a list to use double quotes instead of single quotes. + This is required for mongo queries. + """ + return str(data).replace("'", '"') + def convert_to_df(self, data: list) -> pd.DataFrame: """ Convert a list of dictionaries to a pandas dataframe. @@ -82,17 +89,24 @@ def identify_and_explode(df): merged_df.drop_duplicates(keep="first", inplace=True) return merged_df - def build_filter(self, attributes): + def build_filter(self, attributes, exact_match=False): """ Create a MongoDB filter using $regex for each attribute in the input dictionary. For nested attributes, use dot notation. Parameters: - attributes (dict): Dictionary of attribute names and their corresponding values to match using regex. - Example: {"name": "example", "description": "example", "geo_loc_name": "example"} + attributes (dict): Dictionary of attribute names and their corresponding values to match using regex. + Example: {"name": "example", "description": "example", "geo_loc_name": "example"} + exact_match: bool + This var is used to determine if the inputted attribute value is an exact match or a partial match. Default is False, meaning the user does not need to input an exact match. + Under the hood this is used to determine if the inputted attribute value should be wrapped in a regex expression. Returns: dict: A MongoDB filter dictionary. """ filter_dict = {} - for attribute_name, attribute_value in attributes.items(): - filter_dict[attribute_name] = {"$regex": attribute_value} - return self.string_mongo_list(filter_dict) + if exact_match: + for attribute_name, attribute_value in attributes.items(): + filter_dict[attribute_name] = attribute_value + else: + for attribute_name, attribute_value in attributes.items(): + filter_dict[attribute_name] = {"$regex": attribute_value} + return self._string_mongo_list(filter_dict) diff --git a/nmdc_notebook_tools/test/test_biosample.py b/nmdc_notebook_tools/test/test_biosample.py index 792e827..3a8df3d 100644 --- a/nmdc_notebook_tools/test/test_biosample.py +++ b/nmdc_notebook_tools/test/test_biosample.py @@ -2,6 +2,7 @@ from nmdc_notebook_tools.biosample_search import BiosampleSearch import logging from nmdc_notebook_tools.utils import Utils +from nmdc_notebook_tools.data_processing import DataProcessing def test_find_biosample_by_id(): @@ -25,8 +26,11 @@ def test_biosample_by_filter(): def test_biosample_by_attribute(): biosample = BiosampleSearch() - results = biosample.get_record_by_attribute("id", "nmdc:bsm-11-006pnx90") - assert len(results) > 0 + results = biosample.get_record_by_attribute( + "id", "nmdc:bsm-11-006pnx90", exact_match=True + ) + print(results) + assert len(results) == 1 def test_biosample_by_latitude(): @@ -55,18 +59,18 @@ def test_biosample_by_lat_long(): def test_biosample_build_filter_1(): - u = Utils() + u = DataProcessing() b = BiosampleSearch() filter = u.build_filter({"name": "G6R2_NF_20JUN2016"}) - results = b.biosample_by_filter(filter) + results = b.get_record_by_filter(filter) print(results) - assert len(results) > 0 + assert len(results) == 1 def test_biosample_build_filter_2(): - u = Utils() + u = DataProcessing() b = BiosampleSearch() filter = u.build_filter({"name": "G6R2_NF_20JUN2016", "id": "nmdc:bsm-11-006pnx90"}) - results = b.biosample_by_filter(filter) + results = b.get_record_by_filter(filter) print(results) - assert len(results) < 0 + assert len(results) == 1 diff --git a/pyproject.toml b/pyproject.toml index 7a35e88..8a31f14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "nmdc_notebook_tools" -version = "0.2.2" +version = "0.2.3" description = "A Python library for general research functions using NMDC APIs" authors = [ { name = "Olivia Hess", email = "olivia.hess@pnnl.gov" },