--prune should leave archived S3 keys (#3266)
* --prune should leave archived S3 keys

Part of #2224

* Update deployer/src/deployer/upload.py

Co-authored-by: Ryan Johnson <[email protected]>

* Update deployer/src/deployer/upload.py

Co-authored-by: Ryan Johnson <[email protected]>

* parse_archived_txt_file as a generator

* only one set

* add a nice warning about --prune without --archived-files

Co-authored-by: Ryan Johnson <[email protected]>
peterbe and escattone authored Mar 18, 2021
1 parent 1d3f9e7 commit 0987df8
Showing 4 changed files with 87 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/dev-build.yml
@@ -204,7 +204,7 @@ jobs:
 # XXX would be nice to validate here that $DEPLOYER_BUCKET_PREFIX is truthy
 echo "DEPLOYER_BUCKET_PREFIX=$DEPLOYER_BUCKET_PREFIX"
-poetry run deployer upload --prune ../client/build
+poetry run deployer upload --prune --archived-files ../content/archived.txt ../client/build
 poetry run deployer update-lambda-functions ./aws-lambda
 # TODO
 # Execute command to tell the Dev CloudFront distribution to use the
2 changes: 1 addition & 1 deletion .github/workflows/stage-build.yml
@@ -240,7 +240,7 @@ jobs:
 # XXX would be nice to validate here that $DEPLOYER_BUCKET_PREFIX is truthy
 echo "DEPLOYER_BUCKET_PREFIX=$DEPLOYER_BUCKET_PREFIX"
-poetry run deployer upload --prune ../client/build
+poetry run deployer upload --prune --archived-files ../content/archived.txt ../client/build
 poetry run deployer update-lambda-functions ./aws-lambda
 # TODO: Depending on how long the upload takes, consider switching to
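Both workflows now pass `--archived-files ../content/archived.txt`. That file is not included in this commit; judging from `parse_archived_txt_file` and the key-building loop in upload.py below, a plausible sketch of its format is one `<locale>/<slug>/index.html` path per line, with blank lines and `#` comments ignored:

# hypothetical archived.txt (not part of this commit)
en-us/web/guide/old-page/index.html
fr/web/guide/ancienne-page/index.html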
31 changes: 31 additions & 0 deletions deployer/src/deployer/main.py
@@ -39,6 +39,22 @@ def validate_optional_directory(ctx, param, value):
     return validate_directory(ctx, param, value)
 
 
+def validate_file(ctx, param, value):
+    if not value:
+        raise click.BadParameter(f"{value!r}")
+    path = Path(value)
+    if not path.exists():
+        raise click.BadParameter(f"{value} does not exist")
+    elif not path.is_file():
+        raise click.BadParameter(f"{value} is not a file")
+    return path
+
+
+def validate_optional_file(ctx, param, value):
+    if value:
+        return validate_file(ctx, param, value)
+
+
 @click.group()
 @click.option(
     "--dry-run",
@@ -151,6 +167,15 @@ def whatsdeployed(ctx, directory: Path, output: str):
     show_default=True,
     is_flag=True,
 )
+@click.option(
+    "--archived-files",
+    help=(
+        "The path to the file that lists which files are archived. "
+        "(Only relevant in conjunction with --prune)"
+    ),
+    default=None,
+    callback=validate_optional_file,
+)
 @click.argument("directory", type=click.Path(), callback=validate_directory)
 @click.pass_context
 def upload(ctx, directory: Path, **kwargs):
@@ -160,6 +185,12 @@ def upload(ctx, directory: Path, **kwargs):
         content_roots.append(kwargs["content_translated_root"])
     if kwargs["content_archived_root"]:
         content_roots.append(kwargs["content_archived_root"])
+
+    if kwargs["prune"] and not kwargs["archived_files"]:
+        log.warning(
+            "Warning! Running with --prune but NOT --archived-files will "
+            "possibly delete all archived content."
+        )
     ctx.obj.update(kwargs)
     upload_content(directory, content_roots, ctx.obj)

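For context (an illustration, not part of the commit): click invokes option callbacks as callback(ctx, param, value), passing value=None when the option is omitted. The flow for the new option is roughly:

# Hypothetical sketch of the callback behavior above:
# validate_optional_file(ctx, param, None)
#   -> returns None (option omitted); upload() then logs the
#      "--prune but NOT --archived-files" warning if --prune was given
# validate_optional_file(ctx, param, "../content/archived.txt")
#   -> returns Path("../content/archived.txt"), or raises
#      click.BadParameter if the path doesn't exist or isn't a file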
54 changes: 54 additions & 0 deletions deployer/src/deployer/upload.py
@@ -498,6 +498,14 @@ def delete(self, keys, on_task_complete=None, dry_run=False):
         return timer
 
 
+def parse_archived_txt_file(file: Path):
+    with open(file) as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith("#"):
+                yield line
+
+
 def upload_content(build_directory, content_roots, config):
     full_timer = StopWatch().start()
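A quick sketch of the generator's behavior, reusing the hypothetical archived.txt from above:

from pathlib import Path

# Assuming the hypothetical archived.txt sketched earlier:
for line in parse_archived_txt_file(Path("archived.txt")):
    print(line)
# en-us/web/guide/old-page/index.html
# fr/web/guide/ancienne-page/index.html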

@@ -508,6 +516,7 @@ def upload_content(build_directory, content_roots, config):
     show_progress_bar = not config["no_progressbar"]
     upload_redirects = not config["no_redirects"]
     prune = config["prune"]
+    archived_txt_file = config["archived_files"]
 
     log.info(f"Upload files from: {build_directory}")
     if upload_redirects:
@@ -583,6 +592,15 @@ def on_task_complete(task):
     # now deleted.
     now = datetime.datetime.utcnow().replace(tzinfo=UTC)
     delete_keys = []
+
+    archived_files_as_keys = set()
+    if archived_txt_file:
+        for file in parse_archived_txt_file(archived_txt_file):
+            locale, slug = file.replace("/index.html", "").split("/", 1)
+            archived_files_as_keys.add(f"{bucket_prefix}/{locale}/docs/{slug}")
+        if not archived_files_as_keys:
+            raise Exception(f"found no entries inside {archived_txt_file}")
+
     for key in existing_bucket_objects:
         if key.startswith(f"{bucket_prefix}/_whatsdeployed/"):
             # These are special and wouldn't have been uploaded
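To make the key construction in this hunk concrete, a worked example (the slug is hypothetical; the `main` prefix matches the bucket prefix used in the comments of the next hunk):

# "en-us/web/guide/old-page/index.html"
#   .replace("/index.html", "")  -> "en-us/web/guide/old-page"
#   .split("/", 1)               -> ["en-us", "web/guide/old-page"]
# f"{bucket_prefix}/{locale}/docs/{slug}"
#   -> "main/en-us/docs/web/guide/old-page"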
@@ -603,6 +621,42 @@ def on_task_complete(task):
         if delta.days < 30:
             continue
+
+        # Remember, if `key` comes from an "index.html" file, it will be
+        # represented something like `main/en-us/docs/web/api/documentorshadowroot`,
+        # i.e. with the `/index.html` portion removed.
+        # But every page usually also has an `index.json` file, e.g.
+        # `main/en-us/docs/web/api/index.json`, and there can be other assets
+        # such as `main/en-us/docs/web/api/screenshot.png`.
+
+        # This if-statement protects against deleting anything that
+        # isn't a document.
+        if "/docs/" in key:
+            is_archived = False
+            # To avoid another for-loop with `key.startswith()` for every
+            # key, first look for the low-hanging fruit.
+            if key in archived_files_as_keys:
+                # This is the easiest and fastest lookup.
+                is_archived = True
+            elif (
+                re.sub(r"/(index\.json|contributors\.txt|bcd\.json)$", "", key)
+                in archived_files_as_keys
+            ):
+                # This is easy and fast too, and covers 99% of the other
+                # possible keys.
+                is_archived = True
+            else:
+                # This is for things like
+                # `main/en-us/docs/web/api/screenshot.png`, where you can't
+                # confidently use `path.dirname()` because the key could be
+                # something like `main/fr/docs/web/api/manifest.json`, which
+                # is actually a "folder".
+                for archive_file_as_key in archived_files_as_keys:
+                    if key.startswith(archive_file_as_key):
+                        is_archived = True
+                        break
+            if is_archived:
+                continue
 
         assert key.startswith(bucket_prefix)
 
         delete_keys.append(key)
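A worked trace of the three lookups, assuming archived_files_as_keys contains only the hypothetical key "main/en-us/docs/web/guide/old-page":

# key                                                 matched by        pruned?
# main/en-us/docs/web/guide/old-page                  exact set lookup  no
# main/en-us/docs/web/guide/old-page/index.json       re.sub + lookup   no
# main/en-us/docs/web/guide/old-page/screenshot.png   startswith scan   no
# main/en-us/docs/web/guide/other-page                nothing           yes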
