Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add fed & non-fed dcatus1.1 schemas #114

Merged
merged 2 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion app/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,13 @@ class HarvestSourceForm(FlaskForm):
)
schema_type = SelectField(
"Schema Type",
choices=["iso19115_1", "iso19115_2", "csdgm", "dcatus1.1"],
choices=[
"iso19115_1",
"iso19115_2",
"csdgm",
"dcatus1.1: federal",
"dcatus1.1: non-federal",
],
validators=[DataRequired()],
)
source_type = SelectField(
Expand Down
2 changes: 1 addition & 1 deletion app/readme.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ curl -X POST http://{site}/harvest_source/add -H "Content-Type: application/json
"notification_emails": "user@example.com",
"frequency": "daily",
"url": "http://example2.com",
"schema_type": "dcatus1.1",
"schema_type": "dcatus1.1: federal",
"source_type": "json"
}
'
Expand Down
10 changes: 9 additions & 1 deletion database/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,17 @@ class HarvestSource(db.Model):
index=True,
)
schema_type = db.Column(
db.Enum("iso19115_1", "iso19115_2", "csdgm", "dcatus1.1", name="schema_type"),
db.Enum(
"iso19115_1",
"iso19115_2",
"csdgm",
"dcatus1.1: federal",
"dcatus1.1: non-federal",
name="schema_type",
),
nullable=False,
)

source_type = db.Column(
db.Enum("document", "waf", name="source_type"), nullable=False
)
Expand Down
33 changes: 33 additions & 0 deletions example_data/dcatus/dcatus_single_record_non-federal.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
"@id": "http://www.cftc.gov/data.json",
"@type": "dcat:Catalog",
"conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
"describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
"dataset": [
{
"contactPoint": {
"fn": "Harold W. Hild",
"hasEmail": "mailto:user@example.com"
},
"describedBy": "https://www.cftc.gov/MarketReports/CommitmentsofTraders/ExplanatoryNotes/index.htm",
"description": "COT reports provide a breakdown of each Tuesday's open interest for futures and options on futures market in which 20 or more traders hold positions equal to or above the reporting levels established by CFTC",
"distribution": [
{
"accessURL": "https://www.cftc.gov/MarketReports/CommitmentsofTraders/index.htm"
}
],
"modified": "R/P1W",
"publisher": {
"name": "U.S. Commodity Futures Trading Commission",
"subOrganizationOf": {
"name": "U.S. Government"
}
},
"title": "Commitment of Traders",
"accessLevel": "public",
"identifier": "cftc-dc1",
"keyword": ["commitment of traders", "cot", "open interest"]
}
]
}
27 changes: 19 additions & 8 deletions harvester/harvest.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,7 @@ class HarvestSource:
repr=False,
)

_dataset_schema: dict = field(
default_factory=lambda: open_json(ROOT_DIR / "schemas" / "dataset.json"),
repr=False,
)
_dataset_schema: dict = field(default_factory=lambda: {}, repr=False)
_no_harvest_resp: bool = False

# not read-only because these values are added after initialization
Expand All @@ -100,6 +97,15 @@ def __post_init__(self) -> None:
self._db_interface: HarvesterDBInterface = db_interface
self.get_source_info_from_job_id(self.job_id)

if self.schema_type == "dcatus1.1: federal":
self.dataset_schema = open_json(
ROOT_DIR / "schemas" / "federal_dataset.json"
)
else:
self.dataset_schema = open_json(
ROOT_DIR / "schemas" / "non-federal_dataset.json"
)

@property
def job_id(self) -> str:
return self._job_id
Expand All @@ -116,6 +122,12 @@ def source_attrs(self) -> list:
def dataset_schema(self) -> dict:
return self._dataset_schema

@dataset_schema.setter
def dataset_schema(self, value) -> None:
if not isinstance(value, dict):
raise ValueError("dataset schema must be a dict")
self._dataset_schema = value

@property
def no_harvest_resp(self) -> bool:
return self._no_harvest_resp
Expand Down Expand Up @@ -151,7 +163,7 @@ def internal_records_to_id_hash(self, records: list[dict]) -> None:

def get_record_identifier(self, record: dict) -> str:

record_id = "identifier" if self.schema_type == "dcatus1.1" else "url"
record_id = "identifier" if self.schema_type.startswith("dcatus") else "url"

if record_id not in record:
raise Exception
Expand Down Expand Up @@ -257,7 +269,7 @@ def write_compare_to_db(self) -> dict:
else:
record = self.external_records[record_id]

if self.schema_type == "dcatus1.1":
if self.schema_type.startswith("dcatus"):
source_raw = json.dumps(record.metadata)
else:
source_raw = record.metadata["content"]
Expand Down Expand Up @@ -320,7 +332,7 @@ def synchronize_records(self) -> None:
# no longer setting action in compare so setting it here...
record.action = action

if self.schema_type != "dcatus1.1":
if not self.schema_type.startswith("dcatus"):
record.transform()
record.validate()
record.sync()
Expand Down Expand Up @@ -454,7 +466,6 @@ class Record:
default_factory=lambda: {
"iso19115_1": "iso19115_1",
"iso19115_2": "iso19115_2_datagov",
"dcatus1.1": "dcat_us",
"csdgm": "fgdc",
}
)
Expand Down
File renamed without changes.
Loading
Loading