From cde1d8c9a00e12df0583b5be39b78a8e82212ce6 Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Tue, 12 Jul 2022 15:46:52 +1200 Subject: [PATCH 1/7] feat: create polygon script return temporary file path --- scripts/create_polygons.py | 94 ++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 50 deletions(-) diff --git a/scripts/create_polygons.py b/scripts/create_polygons.py index ba15a11e8..ece35728c 100644 --- a/scripts/create_polygons.py +++ b/scripts/create_polygons.py @@ -4,25 +4,12 @@ from collections import Counter from urllib.parse import urlparse -from aws_helper import get_bucket, get_bucket_name_from_path +from aws_helper import get_bucket from linz_logger import get_log # osgeo is embbed in the Docker image from osgeo import gdal # pylint: disable=import-error -logger = get_log() - -parser = argparse.ArgumentParser() -parser.add_argument("--uri", dest="uri", required=True) -parser.add_argument("--destination", dest="destination", required=True) -arguments = parser.parse_args() -uri = arguments.uri -destination = arguments.destination - -# Split the s3 destination path -destination_bucket_name = get_bucket_name_from_path(destination) -destination_path = destination.replace("s3://", "").replace(f"{destination_bucket_name}/", "") - def create_mask(file_path: str, mask_dst: str) -> None: set_srs_command = f'gdal_edit.py -a_srs EPSG:2193 "{file_path}"' @@ -50,42 +37,49 @@ def get_pixel_count(file_path: str) -> int: return data_pixels_count -with tempfile.TemporaryDirectory() as tmp_dir: - source_file_name = os.path.basename(uri) - # Download the file - if str(uri).startswith("s3://"): - uri_parse = urlparse(uri, allow_fragments=False) - bucket_name = uri_parse.netloc - bucket = get_bucket(bucket_name) - uri = os.path.join(tmp_dir, "temp.tif") - logger.debug( - "download_file", source=uri_parse.path[1:], bucket=bucket_name, destination=uri, sourceFileName=source_file_name - ) - bucket.download_file(uri_parse.path[1:], uri) +def main() -> None: + logger = get_log() + + parser = argparse.ArgumentParser() + parser.add_argument("--source", dest="source", required=True) + arguments = parser.parse_args() + source = arguments.source + + with tempfile.TemporaryDirectory() as tmp_dir: + source_file_name = os.path.basename(source) + # Download the file + if str(source).startswith("s3://"): + uri_parse = urlparse(source, allow_fragments=False) + bucket_name = uri_parse.netloc + bucket = get_bucket(bucket_name) + source = os.path.join(tmp_dir, "temp.tif") + logger.debug( + "download_file", + source=uri_parse.path[1:], + bucket=bucket_name, + destination=source, + sourceFileName=source_file_name, + ) + bucket.download_file(uri_parse.path[1:], source) + + # Run create_mask + logger.debug("create_mask", source=uri_parse.path[1:], bucket=bucket_name, destination=source) + mask_file = os.path.join(tmp_dir, "mask.tif") + create_mask(source, mask_file) + + # Run create_polygon + data_px_count = get_pixel_count(mask_file) + if data_px_count == 0: + # exclude extents if tif is all white or black + logger.debug(f"- data_px_count was zero in create_mask function for the tif {mask_file}") + else: + destination_file_name = os.path.splitext(source_file_name)[0] + ".geojson" + temp_file_path = os.path.join(tmp_dir, destination_file_name) + polygonize_command = f'gdal_polygonize.py -q "{mask_file}" "{temp_file_path}" -f GeoJSON' + os.system(polygonize_command) - # Run create_mask - logger.debug("create_mask", source=uri_parse.path[1:], bucket=bucket_name, destination=uri) - mask_file = os.path.join(tmp_dir, "mask.tif") - create_mask(uri, mask_file) + return temp_file_path - # Run create_polygon - data_px_count = get_pixel_count(mask_file) - if data_px_count == 0: - # exclude extents if tif is all white or black - logger.debug(f"- data_px_count was zero in create_mask function for the tif {mask_file}") - else: - destination_file_name = os.path.splitext(source_file_name)[0] + ".geojson" - temp_file_path = os.path.join(tmp_dir, destination_file_name) - polygonize_command = f'gdal_polygonize.py -q "{mask_file}" "{temp_file_path}" -f GeoJSON' - os.system(polygonize_command) - # Upload shape file - destination_bucket = get_bucket(destination_bucket_name) - destination_file_path = os.path.join(destination_path, destination_file_name) - logger.debug("upload_start", destinationBucket=destination_bucket_name, destinationFile=destination_file_path) - try: - destination_bucket.upload_file(temp_file_path, destination_file_path) - except Exception as e: - logger.debug("upload_error", err=e) - raise e - logger.debug("upload_end", destinationBucket=destination_bucket_name, destinationFile=destination_file_path) +if __name__ == "__main__": + main() From ee8f4483ae18b8df1556400dbba3f22c57caf8ea Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Tue, 12 Jul 2022 15:50:02 +1200 Subject: [PATCH 2/7] fix: main return value and formatting --- scripts/create_polygons.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/create_polygons.py b/scripts/create_polygons.py index ece35728c..833838926 100644 --- a/scripts/create_polygons.py +++ b/scripts/create_polygons.py @@ -37,7 +37,7 @@ def get_pixel_count(file_path: str) -> int: return data_pixels_count -def main() -> None: +def main() -> str: logger = get_log() parser = argparse.ArgumentParser() @@ -78,7 +78,7 @@ def main() -> None: polygonize_command = f'gdal_polygonize.py -q "{mask_file}" "{temp_file_path}" -f GeoJSON' os.system(polygonize_command) - return temp_file_path + return temp_file_path if __name__ == "__main__": From ebd2bc3ca179792d194a3335d2cff00cbe7b87b4 Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Wed, 13 Jul 2022 08:49:22 +1200 Subject: [PATCH 3/7] feat: Allow Create Polygons script to run with multiple files --- scripts/create_polygons.py | 78 +++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/scripts/create_polygons.py b/scripts/create_polygons.py index 833838926..d76ac99d7 100644 --- a/scripts/create_polygons.py +++ b/scripts/create_polygons.py @@ -2,9 +2,11 @@ import os import tempfile from collections import Counter +from typing import List from urllib.parse import urlparse from aws_helper import get_bucket +from format_source import format_source from linz_logger import get_log # osgeo is embbed in the Docker image @@ -37,7 +39,7 @@ def get_pixel_count(file_path: str) -> int: return data_pixels_count -def main() -> str: +def main() -> List[str]: # pylint: disable=too-many-locals logger = get_log() parser = argparse.ArgumentParser() @@ -45,40 +47,46 @@ def main() -> str: arguments = parser.parse_args() source = arguments.source - with tempfile.TemporaryDirectory() as tmp_dir: - source_file_name = os.path.basename(source) - # Download the file - if str(source).startswith("s3://"): - uri_parse = urlparse(source, allow_fragments=False) - bucket_name = uri_parse.netloc - bucket = get_bucket(bucket_name) - source = os.path.join(tmp_dir, "temp.tif") - logger.debug( - "download_file", - source=uri_parse.path[1:], - bucket=bucket_name, - destination=source, - sourceFileName=source_file_name, - ) - bucket.download_file(uri_parse.path[1:], source) - - # Run create_mask - logger.debug("create_mask", source=uri_parse.path[1:], bucket=bucket_name, destination=source) - mask_file = os.path.join(tmp_dir, "mask.tif") - create_mask(source, mask_file) - - # Run create_polygon - data_px_count = get_pixel_count(mask_file) - if data_px_count == 0: - # exclude extents if tif is all white or black - logger.debug(f"- data_px_count was zero in create_mask function for the tif {mask_file}") - else: - destination_file_name = os.path.splitext(source_file_name)[0] + ".geojson" - temp_file_path = os.path.join(tmp_dir, destination_file_name) - polygonize_command = f'gdal_polygonize.py -q "{mask_file}" "{temp_file_path}" -f GeoJSON' - os.system(polygonize_command) - - return temp_file_path + source = format_source(source) + output_files = [] + + for file in source: + with tempfile.TemporaryDirectory() as tmp_dir: + source_file_name = os.path.basename(file) + # Download the file + if str(file).startswith("s3://"): + uri_parse = urlparse(file, allow_fragments=False) + bucket_name = uri_parse.netloc + bucket = get_bucket(bucket_name) + file = os.path.join(tmp_dir, "temp.tif") + logger.debug( + "download_file", + source=uri_parse.path[1:], + bucket=bucket_name, + destination=file, + sourceFileName=source_file_name, + ) + bucket.download_file(uri_parse.path[1:], file) + + # Run create_mask + logger.debug("create_mask", source=uri_parse.path[1:], bucket=bucket_name, destination=file) + mask_file = os.path.join(tmp_dir, "mask.tif") + create_mask(file, mask_file) + + # Run create_polygon + data_px_count = get_pixel_count(mask_file) + if data_px_count == 0: + # exclude extents if tif is all white or black + logger.debug(f"- data_px_count was zero in create_mask function for the tif {mask_file}") + else: + destination_file_name = os.path.splitext(source_file_name)[0] + ".geojson" + temp_file_path = os.path.join(tmp_dir, destination_file_name) + polygonize_command = f'gdal_polygonize.py -q "{mask_file}" "{temp_file_path}" -f GeoJSON' + os.system(polygonize_command) + + output_files.append(temp_file_path) + + return output_files if __name__ == "__main__": From 8c0aedd2b607c1823165aa9ae54dd16b3994adb0 Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Wed, 13 Jul 2022 14:48:40 +1200 Subject: [PATCH 4/7] fix: argument source was not correctly set --- scripts/create_polygons.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/create_polygons.py b/scripts/create_polygons.py index d76ac99d7..31eb88534 100644 --- a/scripts/create_polygons.py +++ b/scripts/create_polygons.py @@ -43,7 +43,7 @@ def main() -> List[str]: # pylint: disable=too-many-locals logger = get_log() parser = argparse.ArgumentParser() - parser.add_argument("--source", dest="source", required=True) + parser.add_argument("--source", dest="source", nargs="+", required=True) arguments = parser.parse_args() source = arguments.source @@ -52,7 +52,9 @@ def main() -> List[str]: # pylint: disable=too-many-locals for file in source: with tempfile.TemporaryDirectory() as tmp_dir: + print(file) source_file_name = os.path.basename(file) + uri_parse = file # Download the file if str(file).startswith("s3://"): uri_parse = urlparse(file, allow_fragments=False) From c3caa70b836f6d719f2cfa5aa95588e370bb4bcf Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Wed, 13 Jul 2022 15:58:02 +1200 Subject: [PATCH 5/7] feat: write processed file paths in tmp/file_list.json --- scripts/create_polygons.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/create_polygons.py b/scripts/create_polygons.py index dfb66d6ea..fac3cf5f2 100644 --- a/scripts/create_polygons.py +++ b/scripts/create_polygons.py @@ -1,5 +1,7 @@ import argparse +import json import os +import subprocess import tempfile from collections import Counter from typing import List @@ -39,7 +41,7 @@ def get_pixel_count(file_path: str) -> int: return data_pixels_count -def main() -> List[str]: # pylint: disable=too-many-locals +def main() -> None: # pylint: disable=too-many-locals logger = get_log() parser = argparse.ArgumentParser() @@ -87,7 +89,8 @@ def main() -> List[str]: # pylint: disable=too-many-locals output_files.append(temp_file_path) - return output_files + with open("/tmp/file_list.json", "w") as jf: + json.dump(output_files, jf) if __name__ == "__main__": From 8130391e9f7fcab086530cb7bbec3ca57fc5ad19 Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Wed, 13 Jul 2022 16:00:45 +1200 Subject: [PATCH 6/7] fix: remove unused imports --- scripts/create_polygons.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/create_polygons.py b/scripts/create_polygons.py index fac3cf5f2..3eb200a39 100644 --- a/scripts/create_polygons.py +++ b/scripts/create_polygons.py @@ -1,10 +1,8 @@ import argparse import json import os -import subprocess import tempfile from collections import Counter -from typing import List from urllib.parse import urlparse from aws_helper import get_bucket From 574372d37cf2652efef6d84f3a298d127e437a8b Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Wed, 13 Jul 2022 16:04:13 +1200 Subject: [PATCH 7/7] fix: pylint complaining about encoding --- scripts/create_polygons.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/create_polygons.py b/scripts/create_polygons.py index 3eb200a39..32eb21ef6 100644 --- a/scripts/create_polygons.py +++ b/scripts/create_polygons.py @@ -87,7 +87,7 @@ def main() -> None: # pylint: disable=too-many-locals output_files.append(temp_file_path) - with open("/tmp/file_list.json", "w") as jf: + with open("/tmp/file_list.json", "w", encoding="utf-8") as jf: json.dump(output_files, jf)