GSA · rshewitt · Nov 22, 2024 · Nov 21, 2024 · Nov 22, 2024
diff --git a/app/forms.py b/app/forms.py
@@ -37,7 +37,13 @@ class HarvestSourceForm(FlaskForm):
     )
     schema_type = SelectField(
         "Schema Type",
-        choices=["iso19115_1", "iso19115_2", "csdgm", "dcatus1.1"],
+        choices=[
+            "iso19115_1",
+            "iso19115_2",
+            "csdgm",
+            "dcatus1.1: federal",
+            "dcatus1.1: non-federal",
+        ],
         validators=[DataRequired()],
     )
     source_type = SelectField(

diff --git a/app/readme.txt b/app/readme.txt
@@ -43,7 +43,7 @@ curl -X POST http://{site}/harvest_source/add -H "Content-Type: application/json
     "notification_emails": "user@example.com",
     "frequency": "daily",
     "url": "http://example2.com",
-    "schema_type": "dcatus1.1",
+    "schema_type": "dcatus1.1: federal",
     "source_type": "json"
 }
 '

diff --git a/database/models.py b/database/models.py
@@ -53,9 +53,17 @@ class HarvestSource(db.Model):
         index=True,
     )
     schema_type = db.Column(
-        db.Enum("iso19115_1", "iso19115_2", "csdgm", "dcatus1.1", name="schema_type"),
+        db.Enum(
+            "iso19115_1",
+            "iso19115_2",
+            "csdgm",
+            "dcatus1.1: federal",
+            "dcatus1.1: non-federal",
+            name="schema_type",
+        ),
         nullable=False,
     )
+
     source_type = db.Column(
         db.Enum("document", "waf", name="source_type"), nullable=False
     )

diff --git a/example_data/dcatus/dcatus_single_record_non-federal.json b/example_data/dcatus/dcatus_single_record_non-federal.json
@@ -0,0 +1,33 @@
+{
+  "@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
+  "@id": "http://www.cftc.gov/data.json",
+  "@type": "dcat:Catalog",
+  "conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
+  "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
+  "dataset": [
+    {
+      "contactPoint": {
+        "fn": "Harold W. Hild",
+        "hasEmail": "mailto:user@example.com"
+      },
+      "describedBy": "https://www.cftc.gov/MarketReports/CommitmentsofTraders/ExplanatoryNotes/index.htm",
+      "description": "COT reports provide a breakdown of each Tuesday's open interest for futures and options on futures market in which 20 or more traders hold positions equal to or above the reporting levels established by CFTC",
+      "distribution": [
+        {
+          "accessURL": "https://www.cftc.gov/MarketReports/CommitmentsofTraders/index.htm"
+        }
+      ],
+      "modified": "R/P1W",
+      "publisher": {
+        "name": "U.S. Commodity Futures Trading Commission",
+        "subOrganizationOf": {
+          "name": "U.S. Government"
+        }
+      },
+      "title": "Commitment of Traders",
+      "accessLevel": "public",
+      "identifier": "cftc-dc1",
+      "keyword": ["commitment of traders", "cot", "open interest"]
+    }
+  ]
+}
diff --git a/harvester/harvest.py b/harvester/harvest.py
@@ -78,10 +78,7 @@ class HarvestSource:
         repr=False,
     )
 
-    _dataset_schema: dict = field(
-        default_factory=lambda: open_json(ROOT_DIR / "schemas" / "dataset.json"),
-        repr=False,
-    )
+    _dataset_schema: dict = field(default_factory=lambda: {}, repr=False)
     _no_harvest_resp: bool = False
 
     # not read-only because these values are added after initialization
@@ -100,6 +97,15 @@ def __post_init__(self) -> None:
         self._db_interface: HarvesterDBInterface = db_interface
         self.get_source_info_from_job_id(self.job_id)
 
+        if self.schema_type == "dcatus1.1: federal":
+            self.dataset_schema = open_json(
+                ROOT_DIR / "schemas" / "federal_dataset.json"
+            )
+        else:
+            self.dataset_schema = open_json(
+                ROOT_DIR / "schemas" / "non-federal_dataset.json"
+            )
+
     @property
     def job_id(self) -> str:
         return self._job_id
@@ -116,6 +122,12 @@ def source_attrs(self) -> list:
     def dataset_schema(self) -> dict:
         return self._dataset_schema
 
+    @dataset_schema.setter
+    def dataset_schema(self, value) -> None:
+        if not isinstance(value, dict):
+            raise ValueError("dataset schema must be a dict")
+        self._dataset_schema = value
+
     @property
     def no_harvest_resp(self) -> bool:
         return self._no_harvest_resp
@@ -151,7 +163,7 @@ def internal_records_to_id_hash(self, records: list[dict]) -> None:
 
     def get_record_identifier(self, record: dict) -> str:
 
-        record_id = "identifier" if self.schema_type == "dcatus1.1" else "url"
+        record_id = "identifier" if self.schema_type.startswith("dcatus") else "url"
 
         if record_id not in record:
             raise Exception
@@ -257,7 +269,7 @@ def write_compare_to_db(self) -> dict:
                 else:
                     record = self.external_records[record_id]
 
-                if self.schema_type == "dcatus1.1":
+                if self.schema_type.startswith("dcatus"):
                     source_raw = json.dumps(record.metadata)
                 else:
                     source_raw = record.metadata["content"]
@@ -320,7 +332,7 @@ def synchronize_records(self) -> None:
                     # no longer setting action in compare so setting it here...
                     record.action = action
 
-                    if self.schema_type != "dcatus1.1":
+                    if not self.schema_type.startswith("dcatus"):
                         record.transform()
                     record.validate()
                     record.sync()
@@ -454,7 +466,6 @@ class Record:
         default_factory=lambda: {
             "iso19115_1": "iso19115_1",
             "iso19115_2": "iso19115_2_datagov",
-            "dcatus1.1": "dcat_us",
             "csdgm": "fgdc",
         }
     )

diff --git a/schemas/dataset.json b/schemas/federal_dataset.json
rename from schemas/dataset.json
rename to schemas/federal_dataset.json