Skip to content

Commit

Permalink
Feat: create stac items TDE-452 (#133)
Browse files Browse the repository at this point in the history
* feat: initialise collection object and stac

* test: collection test

* fix: appease mypy

* fix: move stac version to its own file for reuse

* feat: create items

* fix: add dependencies

* fix: minor code tidy and add test

* fix: formatting

Co-authored-by: Alice Fage <[email protected]>
  • Loading branch information
MDavidson17 and amfage authored Sep 23, 2022
1 parent 8c542d5 commit 47e966a
Show file tree
Hide file tree
Showing 11 changed files with 707 additions and 81 deletions.
491 changes: 425 additions & 66 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ python = "^3.8.10"
boto3 = "^1.24.12"
linz-logger = "^0.9.0"
certifi = "^2022.6.15"
py-multihash = "^2.0.1"

[tool.poetry.dev-dependencies]
black = "^22.3.0"
Expand All @@ -52,3 +53,4 @@ mypy-boto3-s3 = "^1.24.0"
pytest = "^7.1.2"
pytest-dependency = "^0.5.1"
moto = "^3.1.16"
pytest-mock = "^3.8.2"
41 changes: 38 additions & 3 deletions scripts/cli/cli_helper.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import argparse
import json
from datetime import datetime
from os import environ
from typing import List

from dateutil import parser, tz
from linz_logger import get_log


Expand All @@ -26,12 +28,45 @@ def parse_source() -> List[str]:
Returns:
List[str]: A list of paths.
"""
parser = argparse.ArgumentParser()
parser.add_argument("--source", dest="source", nargs="+", required=True)
arguments = parser.parse_args()
parser_args = argparse.ArgumentParser()
parser_args.add_argument("--source", dest="source", nargs="+", required=True)
arguments = parser_args.parse_args()

return format_source(arguments.source)


def is_argo() -> bool:
    """Report whether the `ARGO_TEMPLATE` environment variable holds a non-empty value.

    Returns:
        bool: True when `ARGO_TEMPLATE` is set and is not an empty string, False otherwise.
    """
    argo_template = environ.get("ARGO_TEMPLATE")
    if argo_template:
        return True
    return False


def format_date(date: datetime) -> str:
    """Convert a date, taken as midnight New Zealand time, into a UTC timestamp string.

    Args:
        date: the date to convert; only its year, month and day are used.

    Returns:
        str: the matching UTC date and time formatted as `YYYY-MM-DDTHH:MM:SSZ`.
    """
    # The time of day is discarded: the day always starts at 00:00:00.000 before conversion.
    nz_midnight = date.strftime("%Y-%m-%d") + "T00:00:00.000"
    utc_datetime = nzt_datetime_to_utc_datetime(nz_midnight)
    return f"{utc_datetime.strftime('%Y-%m-%dT%H:%M:%S')}Z"


def nzt_datetime_to_utc_datetime(date: str) -> datetime:
    """Interpret a date string as `Pacific/Auckland` local time and convert it to UTC.

    Args:
        date: a date/time string understood by `dateutil.parser.parse`.
            Any timezone carried by the string is overwritten, not converted.

    Returns:
        datetime: the same instant expressed in UTC.

    Raises:
        Exception: if the string cannot be parsed as a date.
    """
    try:
        parsed = parser.parse(date)
    except parser.ParserError as err:
        raise Exception(f"Not a valid date: {err}") from err

    # `replace` stamps the NZ timezone onto the parsed value without shifting the clock time.
    nz_time = parsed.replace(tzinfo=tz.gettz("Pacific/Auckland"))
    utc_time: datetime = nz_time.astimezone(tz.gettz("UTC"))
    return utc_time


def valid_date(s: str) -> datetime:
    """Argparse `type` callback that parses a `YYYY-MM-DD` string.

    Args:
        s: the raw command line value.

    Returns:
        datetime: the parsed date, with the time set to midnight.

    Raises:
        argparse.ArgumentTypeError: if the value does not match `YYYY-MM-DD`.
    """
    try:
        parsed = datetime.strptime(s, "%Y-%m-%d")
    except ValueError as e:
        raise argparse.ArgumentTypeError(f"not a valid date: {s}") from e
    return parsed
66 changes: 66 additions & 0 deletions scripts/create_stac_items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import argparse
import json
import os
from typing import List

from linz_logger import get_log

from scripts.cli.cli_helper import format_date, format_source, valid_date
from scripts.files.files_helper import get_file_name_from_path, is_tiff
from scripts.files.fs import read, write
from scripts.logging.time_helper import time_in_ms
from scripts.stac.imagery.collection import ImageryCollection
from scripts.stac.imagery.item import ImageryItem
from scripts.stac.util.geotiff import get_extents


def create_imagery_items(files: List[str], start_datetime: str, end_datetime: str, collection_path: str) -> None:
    """Create a STAC Item JSON file in `/tmp/` for every TIFF in `files`.

    Files for which `is_tiff` is false are skipped and logged at trace level.

    Args:
        files: paths of the candidate files.
        start_datetime: value stored as the `start_datetime` of every item.
        end_datetime: value stored as the `end_datetime` of every item.
        collection_path: path of the collection JSON that is read and linked from every item.
    """
    started_at = time_in_ms()

    get_log().info("read collection object", source=collection_path)
    collection_stac = json.loads(read(collection_path))
    collection = ImageryCollection(stac=collection_stac)

    get_log().info("create_stac_items_imagery_start", source=files)

    for file in files:
        if is_tiff(file):
            id_ = get_file_name_from_path(file)
            geometry, bbox = get_extents(file)

            item = ImageryItem(id_, file)
            item.update_datetime(start_datetime, end_datetime)
            item.update_spatial(geometry, bbox)
            item.add_collection(collection, collection_path)

            # The item is named after the source file and always written under /tmp/.
            item_json = json.dumps(item.stac).encode("utf-8")
            write(os.path.join("/tmp/", f"{id_}.json"), item_json)
            get_log().info("imagery_stac_item_created", file=file)
        else:
            get_log().trace("create_stac_file_not_tiff_skipped", file=file)

    get_log().info("create_stac_items_imagery_complete", source=files, duration=time_in_ms() - started_at)


def main() -> None:
    """Parse the command line arguments and create the STAC items.

    Required arguments are `--source` (one or more paths), `--start_datetime` and
    `--end_datetime` (both `YYYY-MM-DD`) and `--collection` (path to collection.json).
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--source", dest="source", nargs="+", required=True)
    # Both date bounds share the same parsing and validation rules.
    for bound in ("start", "end"):
        arg_parser.add_argument(
            f"--{bound}_datetime",
            dest=f"{bound}_datetime",
            help=f"{bound} datetime in format YYYY-MM-DD",
            type=valid_date,
            required=True,
        )
    arg_parser.add_argument("--collection", dest="collection", help="path to collection.json", required=True)
    args = arg_parser.parse_args()

    create_imagery_items(
        format_source(args.source),
        format_date(args.start_datetime),
        format_date(args.end_datetime),
        args.collection,
    )


# Only run the CLI when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
25 changes: 25 additions & 0 deletions scripts/gdal/gdalinfo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import json
from typing import Any, Dict

from linz_logger import get_log

from scripts.gdal.gdal_helper import GDALExecutionException, run_gdal


def gdal_info(path: str) -> Dict[Any, Any]:
    """Run `gdalinfo -stats -json` on a file and return the decoded JSON output.

    The command is run through `run_gdal` with `GDAL_PAM_ENABLED` set to `NO`.

    Args:
        path: path of the file passed to `run_gdal` together with the command.

    Returns:
        Dict[Any, Any]: the `gdalinfo` standard output decoded from JSON.

    Raises:
        GDALExecutionException: re-raised, after logging, when `run_gdal` fails.
        json.JSONDecodeError: when the standard output is not valid JSON.
        Exception: when the process wrote anything to its standard error.
    """
    command = ["gdalinfo", "-stats", "-json", "--config", "GDAL_PAM_ENABLED", "NO"]

    try:
        process = run_gdal(command, path)
    except GDALExecutionException as gee:
        get_log().error("gdalinfo_failed", file=path, error=str(gee))
        raise

    # The output is decoded before stderr is inspected, so a decoding failure is reported first.
    try:
        info: Dict[Any, Any] = json.loads(process.stdout)
    except json.JSONDecodeError as e:
        get_log().error("load_gdalinfo_result_error", file=path, error=e)
        raise

    if process.stderr:
        get_log().error("Gdalinfo_error", file=path, error=str(process.stderr))
        raise Exception(f"Gdalinfo Error {str(process.stderr)}")

    return info
29 changes: 17 additions & 12 deletions scripts/stac/imagery/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,27 @@

from scripts.stac.util.STAC_VERSION import STAC_VERSION

PYSTAC_VERSION = "1.0.0"


class ImageryCollection:
stac: Dict[str, Any]

def __init__(self, title: Optional[str] = None, description: Optional[str] = None) -> None:
self.stac = {
"type": "Collection",
"stac_version": STAC_VERSION,
"id": str(ulid.ULID()),
"title": title,
"description": description,
"license": "CC-BY-4.0",
"links": [{"rel": "self", "href": "./collection.json", "type": "application/json"}],
}
def __init__(
self, title: Optional[str] = None, description: Optional[str] = None, stac: Optional[Dict[str, Any]] = None
) -> None:
if stac:
self.stac = stac
elif title and description:
self.stac = {
"type": "Collection",
"stac_version": STAC_VERSION,
"id": str(ulid.ULID()),
"title": title,
"description": description,
"license": "CC-BY-4.0",
"links": [{"rel": "self", "href": "./collection.json", "type": "application/json"}],
}
else:
raise Exception("incorrect initialising parameters must have 'stac' or 'title and description'")

def add_link(self, href: str, rel: str = "item", file_type: str = "application/json") -> None:
# Will be implemented in Future PR
Expand Down
52 changes: 52 additions & 0 deletions scripts/stac/imagery/item.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from typing import Any, Dict, List, Optional

from scripts.stac.imagery.collection import ImageryCollection
from scripts.stac.util import checksum
from scripts.stac.util.STAC_VERSION import STAC_VERSION
from scripts.stac.util.stac_extensions import StacExtensions


class ImageryItem:
    """STAC Item for a single imagery TIFF, held as a plain dictionary.

    An item is either wrapped around an existing STAC dictionary (`stac`) or built
    from scratch from an identifier and the path of the TIFF (`id_` and `path`).
    """

    # The STAC Item document, ready to be serialised as JSON.
    stac: Dict[str, Any]

    def __init__(self, id_: Optional[str] = None, path: Optional[str] = None, stac: Optional[Dict[str, Any]] = None) -> None:
        """Initialise the item from `stac`, or from `id_` and `path`.

        Args:
            id_: identifier of the new item, also used for its `self` link.
            path: path of the TIFF, used as the asset `href` and to compute its checksum.
            stac: an existing STAC Item dictionary; takes precedence over `id_` and `path`.

        Raises:
            Exception: if neither a non-empty `stac` nor both `id_` and `path` are given.
        """
        if stac:
            self.stac = stac
        elif id_ and path:
            self.stac = {
                "type": "Feature",
                "stac_version": STAC_VERSION,
                "id": id_,
                "links": [
                    {"rel": "self", "href": f"./{id_}.json", "type": "application/json"},
                ],
                "assets": {
                    "visual": {
                        "href": path,
                        # Media type parameters are written `name=value`.
                        "type": "image/tiff; application=geotiff; profile=cloud-optimized",
                        "file:checksum": checksum.multihash_as_hex(path),
                    }
                },
                "stac_extensions": [StacExtensions.file.value],
            }
        else:
            raise Exception("incorrect initialising parameters must have 'stac' or 'id_ and path'")

    def update_datetime(self, start_datetime: str, end_datetime: str) -> None:
        """Set the temporal properties of the item.

        `datetime` is set to None; the range is carried by `start_datetime` and
        `end_datetime`. Other properties already on the item are kept.

        Args:
            start_datetime: start of the range.
            end_datetime: end of the range.
        """
        properties = self.stac.setdefault("properties", {})
        properties.update(
            {
                "start_datetime": start_datetime,
                "end_datetime": end_datetime,
                "datetime": None,
            }
        )

    def update_spatial(self, geometry: List[List[float]], bbox: List[float]) -> None:
        """Set the geometry and bounding box of the item.

        Args:
            geometry: positions of a single polygon ring; stored as the only ring of a `Polygon`.
            bbox: bounding box stored as is.
        """
        self.stac["geometry"] = {"type": "Polygon", "coordinates": [geometry]}
        self.stac["bbox"] = bbox

    def add_collection(self, collection: "ImageryCollection", path: str) -> None:
        """Attach the item to a collection and link to it as `collection` and `parent`.

        Args:
            collection: the collection the item belongs to.
            path: href used for both links.
        """
        # NOTE(review): the STAC Item spec defines `collection` as the collection *id*;
        # the title is stored here. Confirm before changing, existing tests assert the title.
        self.stac["collection"] = collection.stac["title"]
        self.add_link(rel="collection", href=path)
        self.add_link(rel="parent", href=path)

    def add_link(self, rel: str, href: str, file_type: str = "application/json") -> None:
        """Append a link to the item.

        Args:
            rel: relation type of the link.
            href: target of the link.
            file_type: media type of the target.
        """
        # An item loaded from an existing dictionary may not have any links yet.
        self.stac.setdefault("links", []).append({"rel": rel, "href": href, "type": file_type})
44 changes: 44 additions & 0 deletions scripts/stac/tests/item_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from scripts.files.files_helper import get_file_name_from_path
from scripts.stac.imagery.item import ImageryCollection, ImageryItem


def test_imagery_stac_item(mocker) -> None:  # type: ignore
    """An item built from an id and a path exposes the id, dates, extents and checksum it was given."""
    # The checksum is stubbed so that no file has to be read.
    expected_checksum = "1220cdef68d62fb912110b810e62edc53de07f7a44fb2b310db700e9d9dd58baa6b4"
    mocker.patch("scripts.stac.util.checksum.multihash_as_hex", return_value=expected_checksum)

    # Extents are given directly instead of being read from a file.
    corners = [[1799667.5, 5815977.0], [1800422.5, 5815977.0], [1800422.5, 5814986.0], [1799667.5, 5814986.0]]
    extent = [1799667.5, 5815977.0, 1800422.5, 5814986.0]

    tiff_path = "./test/BR34_5000_0302.tiff"
    item_id = get_file_name_from_path(tiff_path)
    start = "2021-01-27 00:00:00Z"
    end = "2021-01-27 00:00:00Z"

    item = ImageryItem(item_id, tiff_path)
    item.update_spatial(corners, extent)
    item.update_datetime(start, end)

    stac = item.stac
    properties = stac["properties"]
    assert stac["id"] == item_id
    assert properties["start_datetime"] == start
    assert properties["end_datetime"] == end
    assert properties["datetime"] is None
    assert stac["geometry"]["coordinates"] == [corners]
    assert stac["bbox"] == extent
    assert stac["assets"]["visual"]["file:checksum"] == expected_checksum


def test_imagery_add_collection(mocker) -> None:  # type: ignore
    """Adding a collection sets the `collection` field and appends a `collection` link."""
    collection = ImageryCollection(title="Collection", description="Collection Description")

    # The checksum is stubbed so that no file has to be read when the item is built.
    mocker.patch(
        "scripts.stac.util.checksum.multihash_as_hex",
        return_value="1220cdef68d62fb912110b810e62edc53de07f7a44fb2b310db700e9d9dd58baa6b4",
    )
    tiff_path = "./test/BR34_5000_0302.tiff"
    item = ImageryItem(get_file_name_from_path(tiff_path), tiff_path)

    item.add_collection(collection, "fake/path.json")

    expected_link = {"rel": "collection", "href": "fake/path.json", "type": "application/json"}
    assert item.stac["collection"] == "Collection"
    assert expected_link in item.stac["links"]
17 changes: 17 additions & 0 deletions scripts/stac/util/checksum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import hashlib
import io

import multihash

from scripts.files import fs

CHUNK_SIZE = 1024 * 1024 # 1MB


def multihash_as_hex(path: str) -> str:
    """Return the SHA-256 multihash of a file as a hexadecimal string.

    The whole file is loaded with `fs.read` first; its content is then fed to the
    hash in pieces of `CHUNK_SIZE` bytes.

    Args:
        path: path of the file to hash.

    Returns:
        str: the `sha2-256` multihash of the file content, hex encoded.
    """
    sha256 = hashlib.sha256()
    stream = io.BytesIO(fs.read(path))
    # `read` returns b"" once the buffer is exhausted, which ends the iteration.
    for chunk in iter(lambda: stream.read(CHUNK_SIZE), b""):
        sha256.update(chunk)
    encoded = multihash.encode(sha256.digest(), "sha2-256")
    result: str = multihash.to_hex_string(encoded)
    return result
16 changes: 16 additions & 0 deletions scripts/stac/util/geotiff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import List, Tuple

from scripts.gdal.gdalinfo import gdal_info


def get_extents(path: str) -> Tuple[List[List[float]], List[float]]:
    """Read the footprint and bounding box of a file from its `gdalinfo` corner coordinates.

    Args:
        path: path of the file passed to `gdal_info`.

    Returns:
        Tuple[List[List[float]], List[float]]: the polygon ring and the bounding box.
            The ring lists upper left, upper right, lower right and lower left, then
            repeats the upper left corner, since a GeoJSON linear ring must be closed.
            The bounding box is ordered `[min_x, min_y, max_x, max_y]`.
    """
    corner_coordinates = gdal_info(path)["cornerCoordinates"]

    corners = [
        [corner_coordinates[name][0], corner_coordinates[name][1]]
        for name in ("upperLeft", "upperRight", "lowerRight", "lowerLeft")
    ]

    # Close the ring with a copy of the first corner so the two positions are not aliased.
    geometry = corners + [list(corners[0])]

    # Taking min/max over every corner yields the lower-left / upper-right order expected
    # for a bbox, whichever way the raster is oriented.
    xs = [corner[0] for corner in corners]
    ys = [corner[1] for corner in corners]
    bbox = [min(xs), min(ys), max(xs), max(ys)]
    return geometry, bbox
5 changes: 5 additions & 0 deletions scripts/stac/util/stac_extensions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from enum import Enum


class StacExtensions(str, Enum):
    """Schema URLs of the STAC extensions that items can declare in `stac_extensions`.

    Members are also `str` instances, so they compare equal to their URL.
    """

    # Schema of the `file` extension, version 2.0.0.
    file = "https://stac-extensions.github.io/file/v2.0.0/schema.json"

0 comments on commit 47e966a

Please sign in to comment.