Skip to content

Commit

Permalink
Update wms harvester to be able to save to geonode
Browse files Browse the repository at this point in the history
  • Loading branch information
meomancer committed Jul 9, 2021
1 parent 8e97218 commit 75ffa0e
Show file tree
Hide file tree
Showing 3 changed files with 198 additions and 54 deletions.
9 changes: 1 addition & 8 deletions geonode/harvesting/harvesters/geonode.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from geonode.layers.models import Layer
from geonode.base.models import ResourceBase
from geonode.documents.models import Document
from geonode.harvesting.utils import get_xpath_value

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -705,14 +706,6 @@ def get_temporal_extent(
return result


def get_xpath_value(
        element: etree.Element,
        xpath_expression: str,
) -> typing.Optional[str]:
    """Return the stripped text found under ``xpath_expression``, or ``None``.

    All text nodes below the matched node(s) are concatenated and the result
    is stripped of surrounding whitespace.

    :param element: element the XPath expression is evaluated against
    :param xpath_expression: XPath selecting the node(s) whose text is wanted
    :returns: the concatenated text, or ``None`` when it is empty
    """
    # lxml refuses namespace maps that contain an empty prefix (the default
    # namespace is stored under ``None`` in ``element.nsmap``), so drop it
    namespaces = {
        prefix: uri for prefix, uri in element.nsmap.items() if prefix
    }
    values = element.xpath(f"{xpath_expression}//text()", namespaces=namespaces)
    return "".join(values).strip() or None


def _get_optional_attribute_value(
        element: etree.Element, xpath: str) -> typing.Optional[str]:
    """Return the first text node selected by ``xpath``, stripped, or ``None``.

    :param element: element the XPath expression is evaluated against
    :param xpath: XPath selecting the node whose direct text is wanted
    :returns: the stripped text of the first match; ``None`` when nothing
        matches or the text is empty
    """
    matches = element.xpath(f"{xpath}/text()", namespaces=element.nsmap)
    if not matches:
        # the value is optional: a missing node is not an error
        return None
    return matches[0].strip() or None
231 changes: 186 additions & 45 deletions geonode/harvesting/harvesters/wms.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,23 @@
import typing
import logging
import requests
from urllib.parse import urlencode
import uuid

from datetime import datetime
from lxml import etree

from django.contrib.gis import geos
from geonode.base.models import ResourceBase
from geonode.layers.models import Layer

from . import base
from ..models import Harvester
from ..utils import XML_PARSER
from .. import resourcedescriptor

from geonode.harvesting.utils import get_xpath_value

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -86,13 +95,34 @@ def get_extra_config_schema(cls) -> typing.Optional[typing.Dict]:
}

def get_num_available_resources(self) -> int:
    """Return how many layers ``self._get_data()`` reports for the remote service."""
    return len(self._get_data()['layers'])

def list_resources(
        self,
        offset: typing.Optional[int] = 0
) -> typing.List[base.BriefRemoteResource]:
    """Return a brief description of every layer reported by ``self._get_data()``.

    Look at ``tasks.update_harvestable_resources()`` in order to understand
    the purpose of ``offset``. Briefly, resources are retrieved in batches and
    ``offset`` controls the pagination of the remote service. WMS has no
    concept of pagination: a single ``GetCapabilities`` response lists all
    available layers. Therefore the first page (``offset`` of ``0`` or
    ``None``) returns everything and any other offset returns an empty list.

    :param offset: pagination offset requested by the caller
    :returns: one ``BriefRemoteResource`` per layer, or ``[]`` past the first page
    """
    if offset:
        return []
    return [
        base.BriefRemoteResource(
            unique_identifier=layer['name'],
            title=layer['title'],
            resource_type='layers',
        )
        for layer in self._get_data()['layers']
    ]

def check_availability(self, timeout_seconds: typing.Optional[int] = 5) -> bool:
try:
Expand All @@ -104,66 +134,177 @@ def check_availability(self, timeout_seconds: typing.Optional[int] = 5) -> bool:
result = True
return result

def get_geonode_resource_type(self, remote_resource_type: str) -> typing.Type[ResourceBase]:
    """Return the GeoNode resource class for ``remote_resource_type``.

    :param remote_resource_type: remote type name; ignored by this harvester
    :returns: always the ``Layer`` class (the class itself, not an instance)
    """
    # WMS only has layers as its resources, so whatever
    # ``remote_resource_type`` is, the answer is always ``Layer``.
    return Layer

def get_resource(
        self,
        harvestable_resource: "HarvestableResource",  # noqa
        harvesting_session_id: int
) -> typing.Optional[base.HarvestedResourceInfo]:
    """Build the harvested description of a single remote WMS layer.

    The layer is looked up by name among the layers returned by
    ``self._get_data()``, using ``harvestable_resource.unique_identifier``.

    :param harvestable_resource: the resource to harvest; its
        ``unique_identifier`` and ``geonode_resource`` attributes are read
    :param harvesting_session_id: not used by this implementation
    :returns: a ``HarvestedResourceInfo`` for the layer, or ``None`` when the
        remote service does not list a layer with that name
    """
    resource_unique_identifier = harvestable_resource.unique_identifier
    data = self._get_data()
    relevant_layer = next(
        (layer for layer in data["layers"] if layer["name"] == resource_unique_identifier),
        None
    )
    if relevant_layer is None:
        logger.error("Could not find resource %r", resource_unique_identifier)
        return None

    # WMS does not provide a uuid, so one is generated the first time a layer
    # is harvested; on updates the uuid of the existing GeoNode resource is reused
    geonode_resource = harvestable_resource.geonode_resource
    if geonode_resource:
        resource_uuid = uuid.UUID(geonode_resource.uuid)
    else:
        resource_uuid = uuid.uuid4()

    # WMS does not provide the date of the resource either, so the current
    # time is used both as the date stamp and as the resource date.
    # NOTE(review): this is a naive datetime - confirm whether an aware one is expected
    time = datetime.now()
    contact = resourcedescriptor.RecordDescriptionContact(**data['contact'])
    return base.HarvestedResourceInfo(
        resource_descriptor=resourcedescriptor.RecordDescription(
            uuid=resource_uuid,
            point_of_contact=contact,
            author=contact,
            date_stamp=time,
            identification=resourcedescriptor.RecordIdentification(
                name=relevant_layer['name'],
                title=relevant_layer['title'],
                date=time,
                date_type='',
                originator=contact,
                graphic_overview_uri='',
                place_keywords=[],
                other_keywords=relevant_layer['keywords'],
                license=[],
                abstract=relevant_layer['abstract'],
                spatial_extent=relevant_layer['spatial_extent']
            ),
            distribution=resourcedescriptor.RecordDistribution(
                legend_url=relevant_layer['legend_url'],
                wms_url=relevant_layer['wms_url']
            ),
            reference_systems=relevant_layer['crs'],
        ),
        additional_information=None
    )

def _get_data(self) -> dict:
    """Fetch the remote ``GetCapabilities`` document and return it as a dict.

    Only leaf layers (``wms:Layer`` elements without nested ``wms:Layer``
    children) are considered. When ``self.layer_title_filter`` is set, a layer
    is kept only if the filter is a case-insensitive substring of its title
    (falling back to its name when it has no title).

    :returns: ``{'contact': <contact dict>, 'layers': [<layer dict>, ...]}``
    :raises requests.HTTPError: when the remote service answers with an error status
    """
    # work on a copy so the update below does not alter ``self._base_wms_parameters``
    params = self._base_wms_parameters.copy()
    params.update({
        "request": "GetCapabilities",
    })
    get_capabilities_response = self.http_session.get(
        self.remote_url, params=params)
    get_capabilities_response.raise_for_status()

    root = etree.fromstring(get_capabilities_response.content, parser=XML_PARSER)
    nsmap = _get_nsmap(root.nsmap)

    title_filter = self.layer_title_filter
    if title_filter is not None:
        title_filter = title_filter.lower()

    layers = []
    leaf_layers = root.xpath("//wms:Layer[not(.//wms:Layer)]", namespaces=nsmap)
    for layer_element in leaf_layers:
        data = self._layer_element_to_json(layer_element)
        if title_filter is not None:
            # a layer without a title is matched on its name instead
            title = data['title'] or data['name'] or ''
            if title_filter not in title.lower():
                continue
        layers.append(data)
    return {
        'contact': self._get_contact(root),
        'layers': layers
    }

def update_geonode_resource(
        self,
        harvested_info: base.HarvestedResourceInfo,
        harvestable_resource: "HarvestableResource",  # noqa
        harvesting_session_id: int,
):
    """Placeholder: always raises ``NotImplementedError``.

    :raises NotImplementedError: unconditionally
    """
    raise NotImplementedError
def _get_contact(self, element: etree.Element) -> dict:
    """Return the service contact details found below ``element`` as a dict.

    Every value is read from ``wms:Service/wms:ContactInformation`` with
    ``get_xpath_value``; ``role`` and ``address_delivery_point`` are always
    empty strings.
    """
    namespaces = _get_nsmap(element.nsmap)
    contact_info = "wms:Service/wms:ContactInformation"
    person = f"{contact_info}/wms:ContactPersonPrimary"
    address = f"{contact_info}/wms:ContactAddress"

    def _text(path: str) -> typing.Optional[str]:
        # one lookup against the element, sharing the same namespace map
        return get_xpath_value(element, path, namespaces)

    return {
        'role': '',
        'name': _text(f"{person}/wms:ContactPerson"),
        'organization': _text(f"{person}/wms:ContactOrganization"),
        'position': _text(f"{contact_info}/wms:ContactPosition"),
        'phone_voice': _text(f"{contact_info}/wms:ContactVoiceTelephone"),
        'phone_facsimile': _text(f"{contact_info}/wms:ContactFacsimileTelephone"),
        'address_delivery_point': '',
        'address_city': _text(f"{address}/wms:City"),
        'address_administrative_area': _text(f"{address}/wms:StateOrProvince"),
        'address_postal_code': _text(f"{address}/wms:PostCode"),
        'address_country': _text(f"{address}/wms:Country"),
        'address_email': _text(f"{contact_info}/wms:ContactElectronicMailAddress"),
    }

def _get_useful_layers(self) -> typing.List[etree.Element]:
    """Return the leaf ``wms:Layer`` elements that pass the title filter.

    A WMS 1.3.0 ``GetCapabilities`` request is sent to ``self.remote_url``.
    When ``self.layer_title_filter`` is set, only layers whose title contains
    it (case-insensitively) are returned.
    """
    response = self.http_session.get(
        self.remote_url,
        params={
            "service": "WMS",
            "version": "1.3.0",
            "request": "GetCapabilities",
        }
    )
    response.raise_for_status()
    capabilities = etree.fromstring(response.content, parser=XML_PARSER)
    namespaces = _get_nsmap(capabilities.nsmap)

    selected = []
    for candidate in capabilities.xpath("//wms:Layer[not(.//wms:Layer)]", namespaces=namespaces):
        # the title is read before the filter is consulted, as the lookup
        # itself fails for a layer without a title
        layer_title = candidate.xpath("wms:Title/text()", namespaces=namespaces)[0]
        wanted = self.layer_title_filter
        if wanted is None or wanted.lower() in layer_title.lower():
            selected.append(candidate)
    return selected
def _layer_element_to_json(self, layer_element: etree.Element) -> dict:
    """Return a plain dict describing one ``wms:Layer`` element.

    Keys of the result:

    - ``name``, ``abstract``: text of the matching child, or ``None``
    - ``title``: text of ``wms:Title``, falling back to the name when absent
    - ``crs``: the first ``wms:CRS`` declared on the layer, or ``None``
    - ``keywords``: list of keyword strings (possibly empty)
    - ``spatial_extent``: polygon built from ``wms:EX_GeographicBoundingBox``,
      or ``None`` when the box is missing or cannot be parsed
    - ``wms_url``: ``self.remote_url`` plus the url-encoded base WMS parameters
    - ``legend_url``: ``href`` of the first legend resource, or ``''``

    :param layer_element: the ``wms:Layer`` element to convert
    """
    nsmap = _get_nsmap(layer_element.nsmap)
    name = get_xpath_value(layer_element, "wms:Name", nsmap)
    # callers lower-case and display the title, so never leave it empty
    # when a name is available
    title = get_xpath_value(layer_element, "wms:Title", nsmap) or name
    abstract = get_xpath_value(layer_element, "wms:Abstract", nsmap)
    keywords = [
        str(keyword)
        for keyword in layer_element.xpath(
            "wms:KeywordList/wms:Keyword/text()", namespaces=nsmap)
    ]

    # legend_url link
    try:
        legend_url = layer_element.xpath(
            "wms:Style/wms:LegendURL/wms:OnlineResource",
            namespaces=nsmap
        )[0].attrib['{' + layer_element.nsmap['xlink'] + '}href']
    except (IndexError, KeyError):
        legend_url = ''
    wms_url = self.remote_url + '?' + urlencode(self._base_wms_parameters)

    # spatial
    # a layer is not required to declare a CRS of its own, so do not index blindly
    crs_values = layer_element.xpath("wms:CRS//text()", namespaces=nsmap)
    crs = crs_values[0] if crs_values else None

    bbox_path = "wms:EX_GeographicBoundingBox"
    # order matches ``Polygon.from_bbox``: (xmin, ymin, xmax, ymax)
    raw_bounds = [
        get_xpath_value(layer_element, f"{bbox_path}/wms:{bound}", nsmap)
        for bound in (
            "westBoundLongitude",
            "southBoundLatitude",
            "eastBoundLongitude",
            "northBoundLatitude",
        )
    ]
    spatial_extent = None
    if all(bound is not None for bound in raw_bounds):
        try:
            # GeoNode seems to have a bug whereby sometimes the reported extent uses a
            # comma as the decimal separator, other times it uses a dot
            spatial_extent = geos.Polygon.from_bbox(
                tuple(float(bound.replace(",", ".")) for bound in raw_bounds))
        except ValueError:
            logger.warning("Could not parse the bounding box of layer %r", name)
    return {
        'name': name,
        'title': title,
        'abstract': abstract,
        'crs': crs,
        'keywords': keywords,
        'spatial_extent': spatial_extent,
        'wms_url': wms_url,
        'legend_url': legend_url,
    }


def _get_nsmap(original: typing.Dict):
Expand Down
12 changes: 11 additions & 1 deletion geonode/harvesting/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from django.utils.timezone import now
from django.utils.module_loading import import_string


# explicitly disable resolving XML entities in order to prevent malicious attacks
XML_PARSER: typing.Final = etree.XMLParser(resolve_entities=False)

Expand Down Expand Up @@ -53,3 +52,14 @@ def validate_worker_configuration(harvester_type, configuration: typing.Dict):
jsonschema.validate(configuration, schema)
except jsonschema.exceptions.SchemaError as exc:
raise RuntimeError(f"Invalid schema: {exc}")


def get_xpath_value(
        element: etree.Element,
        xpath_expression: str,
        nsmap: typing.Optional[dict] = None
) -> typing.Optional[str]:
    """Return the stripped text found under ``xpath_expression``, or ``None``.

    All text nodes below the matched node(s) are concatenated and the result
    is stripped of surrounding whitespace.

    :param element: element the XPath expression is evaluated against
    :param xpath_expression: XPath selecting the node(s) whose text is wanted
    :param nsmap: prefix -> namespace URI mapping used to resolve prefixes;
        defaults to ``element.nsmap`` when empty or ``None``
    :returns: the concatenated text, or ``None`` when it is empty
    """
    if not nsmap:
        nsmap = element.nsmap
    # lxml refuses namespace maps that contain an empty prefix (the default
    # namespace is stored under ``None`` in ``element.nsmap``), so drop it
    namespaces = {prefix: uri for prefix, uri in nsmap.items() if prefix}
    values = element.xpath(f"{xpath_expression}//text()", namespaces=namespaces)
    return "".join(values).strip() or None

0 comments on commit 75ffa0e

Please sign in to comment.