diff --git a/scripts/collection_from_items.py b/scripts/collection_from_items.py
new file mode 100644
index 000000000..59f0d33a1
--- /dev/null
+++ b/scripts/collection_from_items.py
@@ -0,0 +1,69 @@
+import argparse
+import json
+import os
+
+from boto3 import client
+from linz_logger import get_log
+
+from scripts.files.files_helper import is_json
+from scripts.files.fs import read, write
+from scripts.files.fs_s3 import bucket_name_from_path, prefix_from_path
+from scripts.stac.imagery.collection import ImageryCollection
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--uri", dest="uri", help="s3 path to items and collection.json write location", required=True)
+    parser.add_argument("--collection_id", dest="collection_id", required=True)
+    parser.add_argument("--title", dest="title", help="collection title", required=True)
+    parser.add_argument("--description", dest="description", help="collection description", required=True)
+
+    arguments = parser.parse_args()
+
+    uri = arguments.uri
+    collection = ImageryCollection(
+        title=arguments.title, description=arguments.description, collection_id=arguments.collection_id
+    )
+
+    if not uri.startswith("s3://"):
+        msg = f"uri is not a s3 path: {uri}"
+        raise argparse.ArgumentTypeError(msg)
+
+    s3_client = client("s3")
+
+    paginator = s3_client.get_paginator("list_objects_v2")
+    response_iterator = paginator.paginate(Bucket=bucket_name_from_path(uri), Prefix=prefix_from_path(uri))
+    for response in response_iterator:
+        for contents_data in response["Contents"]:
+            key = contents_data["Key"]
+
+            file = os.path.join(f"s3://{bucket_name_from_path(uri)}", key)
+
+            if not is_json(file):
+                get_log().info("skipping file as not json", file=file, action="collection_from_items", reason="skip")
+                continue
+
+            item_stac = json.loads(read(file).decode("utf-8"))
+
+            if not arguments.collection_id == item_stac["collection"]:
+                get_log().info(
+                    "skipping file as item.collection does not match collection_id",
+                    file=file,
+                    action="collection_from_items",
+                    reason="skip",
+                )
+                continue
+
+            collection.add_item(item_stac)
+            get_log().info("item added to collection", item=item_stac["id"], file=file)
+
+    valid_item_count = [dictionary["rel"] for dictionary in collection.stac["links"]].count("item")
+    get_log().info("All valid items added to collection", valid_item_count=valid_item_count)
+
+    destination = os.path.join(uri, "collection.json")
+    write(destination, json.dumps(collection.stac).encode("utf-8"))
+    get_log().info("collection written", destination=destination)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/create_stac.py b/scripts/create_stac.py
index 618dcf228..a623d6cef 100644
--- a/scripts/create_stac.py
+++ b/scripts/create_stac.py
@@ -19,41 +19,40 @@ def main() -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument("--source", dest="source", nargs="+", required=True)
     parser.add_argument("--collection_id", dest="collection_id", help="Unique id for collection", required=False)
-    parser.add_argument("--title", dest="title", help="collection title", required=True)
-    parser.add_argument("--description", dest="description", help="collection description", required=True)
     parser.add_argument(
         "--start_datetime", dest="start_datetime", help="start datetime in format YYYY-MM-DD", type=valid_date, required=True
     )
     parser.add_argument(
         "--end_datetime", dest="end_datetime", help="end datetime in format YYYY-MM-DD", type=valid_date, required=True
     )
+    parser.add_argument("--title", dest="title", help="collection title", required=True)
+    parser.add_argument("--description", dest="description", help="collection description", required=True)
     arguments = parser.parse_args()
 
     source = format_source(arguments.source)
+    title = arguments.title
+    description = arguments.description
+    collection_id = arguments.collection_id
+    start_datetime = format_date(arguments.start_datetime)
+    end_datetime = format_date(arguments.end_datetime)
 
     if arguments.collection_id:
-        collection = ImageryCollection(title=arguments.title, description=arguments.description, collection_id=arguments.ulid)
+        collection = ImageryCollection(title=title, description=description, collection_id=collection_id)
     else:
-        collection = ImageryCollection(title=arguments.title, description=arguments.description)
+        collection = ImageryCollection(title=title, description=description)
 
     for file in source:
         if not is_tiff(file):
             get_log().trace("file_not_tiff_skipped", file=file)
             continue
         gdalinfo_result = gdal_info(file)
-        item = create_item(
-            file,
-            format_date(arguments.start_datetime),
-            format_date(arguments.end_datetime),
-            arguments.collection_id,
-            gdalinfo_result,
-        )
+        item = create_item(file, start_datetime, end_datetime, collection_id, gdalinfo_result)
         tmp_file_path = os.path.join("/tmp/", f"{item.stac['id']}.json")
         write(tmp_file_path, json.dumps(item.stac).encode("utf-8"))
-        get_log().info("imagery_stac_item_created", file=file)
+        get_log().info("stac item written to tmp", location=tmp_file_path)
 
-        collection.add_item(item)
+        collection.add_item(item.stac)
 
     tmp_file_path = os.path.join("/tmp/", "collection.json")
     write(tmp_file_path, json.dumps(collection.stac).encode("utf-8"))
@@ -78,6 +77,7 @@ def create_item(
     item.update_spatial(geometry, bbox)
     item.add_collection(collection_id)
 
+    get_log().info("imagery stac item created", file=file)
     return item
diff --git a/scripts/files/fs_s3.py b/scripts/files/fs_s3.py
index 47dff2ead..b6ed71107 100644
--- a/scripts/files/fs_s3.py
+++ b/scripts/files/fs_s3.py
@@ -65,3 +65,13 @@ def read(path: str, needs_credentials: bool = False) -> bytes:
 
     get_log().debug("read_s3_success", path=path, duration=time_in_ms() - start_time)
     return file
+
+
+def bucket_name_from_path(path: str) -> str:
+    path_parts = path.replace("s3://", "").split("/")
+    return path_parts.pop(0)
+
+
+def prefix_from_path(path: str) -> str:
+    bucket_name = bucket_name_from_path(path)
+    return path.replace(f"s3://{bucket_name}/", "")
diff --git a/scripts/stac/imagery/collection.py b/scripts/stac/imagery/collection.py
index 36d05b3d1..405ce24b1 100644
--- a/scripts/stac/imagery/collection.py
+++ b/scripts/stac/imagery/collection.py
@@ -1,13 +1,10 @@
 from datetime import datetime
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
 import ulid
 
 from scripts.stac.util.STAC_VERSION import STAC_VERSION
 
-if TYPE_CHECKING:
-    from scripts.stac.imagery.item import ImageryItem
-
 
 class ImageryCollection:
     stac: Dict[str, Any]
@@ -26,12 +23,13 @@ def __init__(self, title: str, description: str, collection_id: Optional[str] =
             "links": [{"rel": "self", "href": "./collection.json", "type": "application/json"}],
         }
 
-    def add_item(self, item: "ImageryItem") -> None:
-        item_self_link = next((feat for feat in item.stac["links"] if feat["rel"] == "self"), None)
+    def add_item(self, item: Dict[Any, Any]) -> None:
+
+        item_self_link = next((feat for feat in item["links"] if feat["rel"] == "self"), None)
         if item_self_link:
             self.add_link(href=item_self_link["href"])
-            self.update_temporal_extent(item.stac["properties"]["start_datetime"], item.stac["properties"]["end_datetime"])
-            self.update_spatial_extent(item.stac["bbox"])
+            self.update_temporal_extent(item["properties"]["start_datetime"], item["properties"]["end_datetime"])
+            self.update_spatial_extent(item["bbox"])
 
     def add_link(self, href: str, rel: str = "item", file_type: str = "application/json") -> None:
         self.stac["links"].append({"rel": rel, "href": href, "type": file_type})
diff --git a/scripts/stac/tests/collection_test.py b/scripts/stac/tests/collection_test.py
index 67e1b0280..8ea0e225a 100644
--- a/scripts/stac/tests/collection_test.py
+++ b/scripts/stac/tests/collection_test.py
@@ -65,7 +65,7 @@ def test_add_item(mocker) -> None:  # type: ignore
     end_datetime = "2021-01-27 00:00:00Z"
     item.update_spatial(geometry, bbox)
     item.update_datetime(start_datetime, end_datetime)
-    collection.add_item(item)
+    collection.add_item(item.stac)
 
     assert {"rel": "item", "href": f"./{id_}.json", "type": "application/json"} in collection.stac["links"]
     assert collection.stac["extent"]["temporal"]["interval"] == [[start_datetime, end_datetime]]
diff --git a/scripts/standardise_validate.py b/scripts/standardise_validate.py
index 4a911b823..7029b267a 100644
--- a/scripts/standardise_validate.py
+++ b/scripts/standardise_validate.py
@@ -31,6 +31,9 @@ def main() -> None:
     arguments = parser.parse_args()
 
     source = format_source(arguments.source)
+    start_datetime = format_date(arguments.start_datetime)
+    end_datetime = format_date(arguments.end_datetime)
+    collection_id = arguments.collection_id
 
     if is_argo():
         concurrency = 4
@@ -47,16 +50,11 @@ def main() -> None:
             continue
         gdalinfo_result = gdal_info(file)
         qa_file(file, srs, gdalinfo_result)
-        item = create_item(
-            file,
-            format_date(arguments.start_datetime),
-            format_date(arguments.end_datetime),
-            arguments.collection_id,
-            gdalinfo_result,
-        )
+        item = create_item(file, start_datetime, end_datetime, collection_id, gdalinfo_result)
+
         tmp_file_path = os.path.join("/tmp/", f"{item.stac['id']}.json")
         write(tmp_file_path, json.dumps(item.stac).encode("utf-8"))
-        get_log().info("imagery_stac_item_created", file=file)
+        get_log().info("stac item written to tmp", location=tmp_file_path)
 
 
 if __name__ == "__main__":
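Usage note: a minimal sketch of how the new fs_s3 helpers split an S3 path, and how collection_from_items.py might then be invoked. The bucket name, prefix, collection id, title, and description below are made-up example values, not taken from this change.

# Illustrative sketch only; the example values are hypothetical, not part of this diff.
from scripts.files.fs_s3 import bucket_name_from_path, prefix_from_path

uri = "s3://example-bucket/example-dataset/"  # hypothetical S3 path
assert bucket_name_from_path(uri) == "example-bucket"
assert prefix_from_path(uri) == "example-dataset/"

# collection_from_items.py could then be run along these lines (all arguments are examples):
#   python scripts/collection_from_items.py \
#       --uri s3://example-bucket/example-dataset/ \
#       --collection_id example-collection-id \
#       --title "Example title" \
#       --description "Example description"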