Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: create collection TDE-453 #177

Merged
merged 6 commits into from
Oct 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions scripts/collection_from_items.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import argparse
import json
import os

from boto3 import client
from linz_logger import get_log

from scripts.files.files_helper import is_json
from scripts.files.fs import read, write
from scripts.files.fs_s3 import bucket_name_from_path, prefix_from_path
from scripts.stac.imagery.collection import ImageryCollection


def main() -> None:
    """Build a STAC collection from item JSON files under an s3 prefix.

    Lists every object under --uri, adds each JSON item whose ``collection``
    matches --collection_id to a new ImageryCollection, then writes
    ``collection.json`` back to the same s3 location.

    Raises:
        argparse.ArgumentTypeError: if --uri is not an s3 path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--uri", dest="uri", help="s3 path to items and collection.json write location", required=True)
    parser.add_argument("--collection_id", dest="collection_id", required=True)
    parser.add_argument("--title", dest="title", help="collection title", required=True)
    parser.add_argument("--description", dest="description", help="collection description", required=True)

    arguments = parser.parse_args()

    uri = arguments.uri

    # Fail fast on a bad uri before building the collection or talking to S3.
    if not uri.startswith("s3://"):
        msg = f"uri is not a s3 path: {uri}"
        raise argparse.ArgumentTypeError(msg)

    collection = ImageryCollection(
        title=arguments.title, description=arguments.description, collection_id=arguments.collection_id
    )

    s3_client = client("s3")
    # Loop-invariant: resolve the bucket once instead of on every object.
    bucket = bucket_name_from_path(uri)

    paginator = s3_client.get_paginator("list_objects_v2")
    response_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix_from_path(uri))
    for response in response_iterator:
        # "Contents" is absent from a page that matched no objects.
        for contents_data in response.get("Contents", []):
            key = contents_data["Key"]

            file = os.path.join(f"s3://{bucket}", key)

            if not is_json(file):
                get_log().info("skipping file as not json", file=file, action="collection_from_items", reason="skip")
                continue

            item_stac = json.loads(read(file).decode("utf-8"))

            # .get(): JSON without a "collection" key (e.g. an existing
            # collection.json under the same prefix) is skipped, not a KeyError.
            if arguments.collection_id != item_stac.get("collection"):
                get_log().info(
                    "skipping file as item.collection does not match collection_id",
                    file=file,
                    action="collection_from_items",
                    reason="skip",
                )
                continue

            collection.add_item(item_stac)
            get_log().info("item added to collection", item=item_stac["id"], file=file)

    valid_item_count = [dictionary["rel"] for dictionary in collection.stac["links"]].count("item")
    get_log().info("All valid items added to collection", valid_item_count=valid_item_count)

    destination = os.path.join(uri, "collection.json")
    write(destination, json.dumps(collection.stac).encode("utf-8"))
    get_log().info("collection written", destination=destination)


# Script entry point: build the collection from CLI arguments.
if __name__ == "__main__":
    main()
26 changes: 13 additions & 13 deletions scripts/create_stac.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,41 +19,40 @@ def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("--source", dest="source", nargs="+", required=True)
parser.add_argument("--collection_id", dest="collection_id", help="Unique id for collection", required=False)
parser.add_argument("--title", dest="title", help="collection title", required=True)
parser.add_argument("--description", dest="description", help="collection description", required=True)
parser.add_argument(
"--start_datetime", dest="start_datetime", help="start datetime in format YYYY-MM-DD", type=valid_date, required=True
)
parser.add_argument(
"--end_datetime", dest="end_datetime", help="end datetime in format YYYY-MM-DD", type=valid_date, required=True
)
parser.add_argument("--title", dest="title", help="collection title", required=True)
parser.add_argument("--description", dest="description", help="collection description", required=True)

arguments = parser.parse_args()

source = format_source(arguments.source)
title = arguments.title
description = arguments.description
collection_id = arguments.collection_id
start_datetime = format_date(arguments.start_datetime)
end_datetime = format_date(arguments.end_datetime)

if arguments.collection_id:
collection = ImageryCollection(title=arguments.title, description=arguments.description, collection_id=arguments.ulid)
collection = ImageryCollection(title=title, description=description, collection_id=collection_id)
else:
collection = ImageryCollection(title=arguments.title, description=arguments.description)
collection = ImageryCollection(title=title, description=description)

for file in source:
if not is_tiff(file):
get_log().trace("file_not_tiff_skipped", file=file)
continue
gdalinfo_result = gdal_info(file)
item = create_item(
file,
format_date(arguments.start_datetime),
format_date(arguments.end_datetime),
arguments.collection_id,
gdalinfo_result,
)
item = create_item(file, start_datetime, end_datetime, collection_id, gdalinfo_result)
tmp_file_path = os.path.join("/tmp/", f"{item.stac['id']}.json")
write(tmp_file_path, json.dumps(item.stac).encode("utf-8"))
get_log().info("imagery_stac_item_created", file=file)
get_log().info("stac item written to tmp", location=tmp_file_path)

collection.add_item(item)
collection.add_item(item.stac)

tmp_file_path = os.path.join("/tmp/", "collection.json")
write(tmp_file_path, json.dumps(collection.stac).encode("utf-8"))
Expand All @@ -78,6 +77,7 @@ def create_item(
item.update_spatial(geometry, bbox)
item.add_collection(collection_id)

get_log().info("imagery stac item created", file=file)
return item


Expand Down
10 changes: 10 additions & 0 deletions scripts/files/fs_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,13 @@ def read(path: str, needs_credentials: bool = False) -> bytes:

get_log().debug("read_s3_success", path=path, duration=time_in_ms() - start_time)
return file


def bucket_name_from_path(path: str) -> str:
    """Return the bucket name from an s3 path, e.g. "s3://bucket/key" -> "bucket"."""
    stripped = path.replace("s3://", "")
    bucket, _, _ = stripped.partition("/")
    return bucket


def prefix_from_path(path: str) -> str:
    """Return the key prefix from an s3 path, e.g. "s3://bucket/a/b" -> "a/b"."""
    # Derive the bucket name inline, then drop the scheme-and-bucket prefix.
    bucket_name = path.replace("s3://", "").split("/")[0]
    return path.replace(f"s3://{bucket_name}/", "")
14 changes: 6 additions & 8 deletions scripts/stac/imagery/collection.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
from datetime import datetime
from typing import TYPE_CHECKING, Any, Dict, List, Optional
from typing import Any, Dict, List, Optional

import ulid

from scripts.stac.util.STAC_VERSION import STAC_VERSION

if TYPE_CHECKING:
from scripts.stac.imagery.item import ImageryItem


class ImageryCollection:
stac: Dict[str, Any]
Expand All @@ -26,12 +23,13 @@ def __init__(self, title: str, description: str, collection_id: Optional[str] =
"links": [{"rel": "self", "href": "./collection.json", "type": "application/json"}],
}

def add_item(self, item: "ImageryItem") -> None:
item_self_link = next((feat for feat in item.stac["links"] if feat["rel"] == "self"), None)
def add_item(self, item: Dict[Any, Any]) -> None:

item_self_link = next((feat for feat in item["links"] if feat["rel"] == "self"), None)
if item_self_link:
self.add_link(href=item_self_link["href"])
self.update_temporal_extent(item.stac["properties"]["start_datetime"], item.stac["properties"]["end_datetime"])
self.update_spatial_extent(item.stac["bbox"])
self.update_temporal_extent(item["properties"]["start_datetime"], item["properties"]["end_datetime"])
self.update_spatial_extent(item["bbox"])

def add_link(self, href: str, rel: str = "item", file_type: str = "application/json") -> None:
self.stac["links"].append({"rel": rel, "href": href, "type": file_type})
Expand Down
2 changes: 1 addition & 1 deletion scripts/stac/tests/collection_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_add_item(mocker) -> None: # type: ignore
end_datetime = "2021-01-27 00:00:00Z"
item.update_spatial(geometry, bbox)
item.update_datetime(start_datetime, end_datetime)
collection.add_item(item)
collection.add_item(item.stac)

assert {"rel": "item", "href": f"./{id_}.json", "type": "application/json"} in collection.stac["links"]
assert collection.stac["extent"]["temporal"]["interval"] == [[start_datetime, end_datetime]]
Expand Down
14 changes: 6 additions & 8 deletions scripts/standardise_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ def main() -> None:
arguments = parser.parse_args()

source = format_source(arguments.source)
start_datetime = format_date(arguments.start_datetime)
end_datetime = format_date(arguments.end_datetime)
collection_id = arguments.collection_id

if is_argo():
concurrency = 4
Expand All @@ -47,16 +50,11 @@ def main() -> None:
continue
gdalinfo_result = gdal_info(file)
qa_file(file, srs, gdalinfo_result)
item = create_item(
file,
format_date(arguments.start_datetime),
format_date(arguments.end_datetime),
arguments.collection_id,
gdalinfo_result,
)
item = create_item(file, start_datetime, end_datetime, collection_id, gdalinfo_result)

tmp_file_path = os.path.join("/tmp/", f"{item.stac['id']}.json")
write(tmp_file_path, json.dumps(item.stac).encode("utf-8"))
get_log().info("imagery_stac_item_created", file=file)
get_log().info("stac item written to tmp", location=tmp_file_path)


if __name__ == "__main__":
Expand Down