Skip to content

Commit

Permalink
feat: ✨ doi registration (#9)
Browse files Browse the repository at this point in the history
* 🚧 wip: add doi registration

* 🚨 chore: appease the linter

* 🎨 style: fix code style issues with Black

* ✨ feat: add doi generation from test.datacite

* 🚧 wip: create payload to register datasets doi

* chore: generate mermaid diagrams

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* 🎨 style: fix code style issues with Black

* 🚧 wip: gather all data for doi registration payload

* 🚧 wip: correcting payload for doi registration

* 🎨 style: fix code style issues with Black

* ✨ feat: doi registration complete

* 🎨 style: fix code style issues with Black

---------

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Sanjay Soundarajan <[email protected]>
Co-authored-by: Lint Action <[email protected]>
Co-authored-by: slugb0t <[email protected]>
  • Loading branch information
4 people authored Feb 27, 2024
1 parent 188f97b commit 3510fe6
Show file tree
Hide file tree
Showing 10 changed files with 579 additions and 54 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
name: Lint

on: [push]

jobs:
  lint:
    name: Run linters
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.11"]

    steps:
      # checkout@v2 / setup-python@v2 run on the deprecated Node 12
      # runtime and are no longer maintained; v4/v5 are the current
      # majors with the same inputs used here.
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Lint with flake8
        run: poe flake8

      - name: Type check with mypy
        run: poe typecheck

      - name: Lint with pylint
        run: poe pylint
11 changes: 11 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Configuration for the application."""

from os import environ
from pathlib import Path

Expand Down Expand Up @@ -32,5 +33,15 @@ def get_env(key, optional=False):
FAIRHUB_DATABASE_USER = get_env("FAIRHUB_DATABASE_USER", optional=True)
FAIRHUB_DATABASE_PORT = get_env("FAIRHUB_DATABASE_PORT", optional=True)

# Deployment environment name ("staging"/"dev" select the DataCite test
# API below).  No optional=True is passed, so presumably get_env treats
# this as required — confirm against get_env's definition above.
FAIRHUB_ENVIRONMENT = get_env("FAIRHUB_ENVIRONMENT")

# Credentials for the DataCite REST API; optional so environments that
# never register DOIs can still start up.
DATACITE_CREDENTIALS = get_env("DATACITE_CREDENTIALS", optional=True)

# Azure Blob Storage access (both required).
AZURE_STORAGE_ACCESS_KEY = get_env("AZURE_STORAGE_ACCESS_KEY")
AZURE_STORAGE_CONNECTION_STRING = get_env("AZURE_STORAGE_CONNECTION_STRING")

# Default to the production DataCite endpoint; overridden below for
# non-production environments.
DATACITE_API_URL = "https://api.datacite.org"

if FAIRHUB_ENVIRONMENT in ["staging", "dev"]:
    # Using the test environment for DataCite
    DATACITE_API_URL = "https://api.test.datacite.org"
19 changes: 16 additions & 3 deletions function_app.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Azure Function App for ETL pipeline."""

import logging

import azure.functions as func

from utils import file_operations
from publish_pipeline.generate_high_level_metadata.generate_changelog import (
pipeline as generate_changelog_pipeline,
)
Expand All @@ -24,10 +25,12 @@
from publish_pipeline.generate_high_level_metadata.generate_study_description import (
pipeline as generate_study_description_pipeline,
)
from publish_pipeline.register_doi.register_doi import pipeline as register_doi_pipeline
from stage_one.env_sensor_pipeline import pipeline as stage_one_env_sensor_pipeline
from stage_one.img_identifier_pipeline import (
pipeline as stage_one_img_identifier_pipeline,
)
from utils import file_operations

app = func.FunctionApp()

Expand Down Expand Up @@ -166,6 +169,18 @@ def generate_discovery_metadata(req: func.HttpRequest) -> func.HttpResponse:
return func.HttpResponse("Failed", status_code=500, mimetype="application/json")


@app.route(route="register-doi", auth_level=func.AuthLevel.FUNCTION)
def register_doi(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP trigger that registers a DOI for the study.

    Delegates all work to ``register_doi_pipeline`` and maps its outcome
    to a plain-text HTTP response: "Success" (200) when the pipeline
    completes, "Failed" (500) if it raises.
    """
    try:
        register_doi_pipeline()
        return func.HttpResponse("Success", status_code=200, mimetype="text/plain")
    except Exception:  # top-level handler boundary: report 500, never crash host
        # Use logging.exception (the file already imports logging) instead of
        # print() so the full traceback reaches the Azure Functions log stream.
        logging.exception("DOI registration failed")
        return func.HttpResponse("Failed", status_code=500, mimetype="text/plain")


@app.route(route="moving-folders", auth_level=func.AuthLevel.FUNCTION)
def moving_folders(req: func.HttpRequest) -> func.HttpResponse:
"""Moves the directories along with the files in the Azure Database."""
Expand All @@ -176,5 +191,3 @@ def moving_folders(req: func.HttpRequest) -> func.HttpResponse:
def copying_folders(req: func.HttpRequest) -> func.HttpResponse:
    """Copies the directories along with the files in the Azure Database.

    Thin HTTP wrapper: delegates request parsing and response building to
    the shared ``file_operations.file_operation`` helper, passing
    ``copy_directory`` as the operation to perform.
    """
    return file_operations.file_operation(file_operations.copy_directory, req)


Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,14 @@ def pipeline():

identifier = {}

# todo: generating a random uuid for now
# todo: replace with the actual doi when we have it
# Get the dataset identifier
identifier["identifierValue"] = str(uuid.uuid4())
cur.execute(
"SELECT doi FROM version WHERE dataset_id = %s",
(dataset_id,),
)

doi = cur.fetchone()
identifier["identifierValue"] = doi[0]
identifier["identifierType"] = "DOI"

dataset_metadata["Identifier"] = identifier
Expand Down Expand Up @@ -487,19 +491,19 @@ def pipeline():
item = {}

item["funderName"] = funding_reference[0]

item["funderIdentifier"] = {}
item["funderIdentifier"]["funderIdentifierValue"] = funding_reference[1]

if funding_reference[2] is not None and funding_reference[2] != "":
item["funderIdentifier"]["funderIdentifierType"] = funding_reference[2]
if funding_reference[3] is not None and funding_reference[3] != "":
item["funderIdentifier"]["schemeURI"] = funding_reference[3]

item["awardNumber"] = {}
item["awardNumber"]["awardNumberValue"] = funding_reference[4]

if funding_reference[5] is not None and funding_reference[5] != "":
item["awardNumber"]["awardURI"] = funding_reference[5]

if funding_reference[6] is not None and funding_reference[6] != "":
item["awardTitle"] = funding_reference[6]

Expand Down Expand Up @@ -540,19 +544,19 @@ def pipeline():
for related_item_identifier in related_item_identifiers:
item_identifier = {}

item_identifier[
"relatedItemIdentifierValue"
] = related_item_identifier[0]
item_identifier[
"relatedItemIdentifierType"
] = related_item_identifier[1]
item_identifier["relatedItemIdentifierValue"] = (
related_item_identifier[0]
)
item_identifier["relatedItemIdentifierType"] = (
related_item_identifier[1]
)
if (
related_item_identifier[2] is not None
and related_item_identifier[2] != ""
):
item_identifier[
"relatedMetadataScheme"
] = related_item_identifier[2]
item_identifier["relatedMetadataScheme"] = (
related_item_identifier[2]
)
if (
related_item_identifier[3] is not None
and related_item_identifier[3] != ""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def pipeline():
pooled_data_folders.append(blob.name.split("/")[2])

# print(pooled_data_folders)
# Replacing the list of folders from DB with a hardcoded list
pooled_data_folders = ["ekg", "redcap_data", "oct"]

# Create a temporary folder on the local machine
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,25 +49,25 @@ def pipeline():
identification_module["OrgStudyIdInfo"] = {}

# Study Identifier
identification_module["OrgStudyIdInfo"][
"OrgStudyId"
] = primary_study_identification[0]
identification_module["OrgStudyIdInfo"]["OrgStudyId"] = (
primary_study_identification[0]
)
# Study Identifier Type
identification_module["OrgStudyIdInfo"][
"OrgStudyIdType"
] = primary_study_identification[1]
identification_module["OrgStudyIdInfo"]["OrgStudyIdType"] = (
primary_study_identification[1]
)

if primary_study_identification[2] and primary_study_identification[2] != "":
# Study Identifier Domain
identification_module["OrgStudyIdInfo"][
"OrgStudyIdDomain"
] = primary_study_identification[2]
identification_module["OrgStudyIdInfo"]["OrgStudyIdDomain"] = (
primary_study_identification[2]
)

if primary_study_identification[3] and primary_study_identification[3] != "":
# Study Identifier Link
identification_module["OrgStudyIdInfo"][
"OrgStudyIdLink"
] = primary_study_identification[3]
identification_module["OrgStudyIdInfo"]["OrgStudyIdLink"] = (
primary_study_identification[3]
)

# Get the secondary study identification metadata
cur.execute(
Expand Down Expand Up @@ -235,18 +235,18 @@ def pipeline():
design_module["DesignInfo"]["DesignAllocation"] = study_design[1]
design_module["DesignInfo"]["DesignInterventionModel"] = study_design[2]
if study_design[3] and study_design[3] != "":
design_module["DesignInfo"][
"DesignInterventionModelDescription"
] = study_design[3]
design_module["DesignInfo"]["DesignInterventionModelDescription"] = (
study_design[3]
)
design_module["DesignInfo"]["DesignPrimaryPurpose"] = study_design[4]

design_module["DesignInfo"]["DesignMaskingInfo"] = {}
design_module["DesignInfo"]["DesignMaskingInfo"][
"DesignMasking"
] = study_design[5]
design_module["DesignInfo"]["DesignMaskingInfo"][
"DesignMaskingDescription"
] = study_design[6]
design_module["DesignInfo"]["DesignMaskingInfo"]["DesignMasking"] = (
study_design[5]
)
design_module["DesignInfo"]["DesignMaskingInfo"]["DesignMaskingDescription"] = (
study_design[6]
)

design_module["DesignInfo"]["DesignMaskingInfo"]["DesignWhoMaskedList"] = []

Expand Down
Loading

0 comments on commit 3510fe6

Please sign in to comment.