Skip to content

Commit

Permalink
Build: communicate uploading errors to users (#9941)
Browse files Browse the repository at this point in the history
* Build: check if the output directory is a directory

It checks for `_readthedocs/<format>` output directory to be a directory.
If it's not a directory, it silently skip that format for now.
This behavior will change once we move out `store_build_artifacts` from
`on_success` and we can communicate errors easily to the user.

Related #9931

* Build: refactor logic to validate artifacts output paths

Wrap all the logic into a class method so it's easier to follow and read.
It also remove some duplicated checks of `os.path.exists`,
making sure we are always using _validated_ paths already.

* Build: communicate uploading errors to users

Running `store_build_artifacts` from inside `.execute()` method
instead of from `.on_sucess()` allow us to raise any exception and handle it
properly.

We can raise a `BuildUserError` to clearly communicate the specific error to the
user, or any other exception to show the generic message: "Try again or contact
us".

Besides, this allows us to *fail the build* instead of continuing silently
without the user noticing this:

- there more than 1 file of formats that don't support multiple files
- the output path is not a directory
- there were errors uploading the artifacts to the storage

* Build: validate the output directory has at least 1 file

In case the `_readthedocs/<format>` directory is created by it contains 0 files,
we fail the build and communicate this to the user.

* Build: constant for artifact types that don't support multiple files
  • Loading branch information
humitos authored Jan 31, 2023
1 parent 5ab0da0 commit 0c0d2c6
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 83 deletions.
8 changes: 8 additions & 0 deletions readthedocs/builds/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,11 @@
"html",
"json",
)
# Artifacts that expect one and only one file in the output directory.
# NOTE: currently, this is a limitation that we are consider to remove
# https://github.com/readthedocs/readthedocs.org/issues/9931#issuecomment-1403415757
ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT = (
"htmlzip",
"epub",
"pdf",
)
12 changes: 12 additions & 0 deletions readthedocs/doc_builder/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ class BuildUserError(BuildBaseException):
BUILD_COMMANDS_WITHOUT_OUTPUT = gettext_noop(
f'No "{BUILD_COMMANDS_OUTPUT_PATH_HTML}" folder was created during this build.'
)
BUILD_OUTPUT_IS_NOT_A_DIRECTORY = gettext_noop(
'Build output directory for format "{artifact_type}" is not a directory.'
)
BUILD_OUTPUT_HAS_0_FILES = gettext_noop(
'Build output directory for format "{artifact_type}" does not contain any files. '
"It seems the build process created the directory but did not save any file to it."
)
BUILD_OUTPUT_HAS_MULTIPLE_FILES = gettext_noop(
'Build output directory for format "{artifact_type}" contains multiple files '
"and it is not currently supported. "
'Please, remove all the files but the "{artifact_type}" your want to upload.'
)


class BuildUserSkip(BuildUserError):
Expand Down
189 changes: 113 additions & 76 deletions readthedocs/projects/tasks/builds.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from readthedocs.builds import tasks as build_tasks
from readthedocs.builds.constants import (
ARTIFACT_TYPES,
ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT,
BUILD_FINAL_STATES,
BUILD_STATE_BUILDING,
BUILD_STATE_CLONING,
Expand Down Expand Up @@ -494,7 +495,6 @@ def on_failure(self, exc, task_id, args, kwargs, einfo):
if self.data.version:
version_type = self.data.version.type

# NOTE: autoflake gets confused here. We need the NOQA for now.
status = BUILD_STATUS_FAILURE
if isinstance(exc, BuildUserSkip):
# The build was skipped by returning the magic exit code,
Expand All @@ -513,55 +513,91 @@ def on_failure(self, exc, task_id, args, kwargs, einfo):
# Update build object
self.data.build['success'] = False

def on_success(self, retval, task_id, args, kwargs):
time_before_store_build_artifacts = timezone.now()
# Store build artifacts to storage (local or cloud storage)
#
# TODO: move ``store_build_artifacts`` inside ``execute`` so we can
# handle exceptions properly on ``on_failure``
self.store_build_artifacts()
log.info(
"Store build artifacts finished.",
time=(timezone.now() - time_before_store_build_artifacts).seconds,
)
def get_valid_artifact_types(self):
"""
Return a list of all the valid artifact types for this build.
# HTML are built successfully.
html = os.path.exists(
self.data.project.artifact_path(
version=self.data.version.slug, type_="html"
)
)
localmedia = os.path.exists(
self.data.project.artifact_path(
version=self.data.version.slug, type_="htmlzip"
)
)
pdf = os.path.exists(
self.data.project.artifact_path(version=self.data.version.slug, type_="pdf")
)
epub = os.path.exists(
self.data.project.artifact_path(
version=self.data.version.slug, type_="epub"
It performs the following checks on each output format type path:
- it exists
- it is a directory
- does not contains more than 1 files (only PDF, HTMLZip, ePUB)
TODO: remove the limitation of only 1 file.
Add support for multiple PDF files in the output directory and
grab them by using glob syntaxt between other files that could be garbage.
"""
valid_artifacts = []
for artifact_type in ARTIFACT_TYPES:
artifact_directory = self.data.project.artifact_path(
version=self.data.version.slug,
type_=artifact_type,
)
)
if not os.path.exists(artifact_directory):
# There is no output directory.
# Skip this format.
continue

if not os.path.isdir(artifact_directory):
log.error(
"The output path is not a directory.",
output_format=artifact_type,
)
raise BuildUserError(
BuildUserError.BUILD_OUTPUT_IS_NOT_A_DIRECTORY.format(
artifact_type=artifact_type
)
)

# Check if there are multiple files on artifact directories.
# These output format does not support multiple files yet.
# In case multiple files are found, the upload for this format is not performed.
if artifact_type in ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT:
artifact_format_files = len(os.listdir(artifact_directory))
if artifact_format_files > 1:
log.error(
"Multiple files are not supported for this format. "
"Skipping this output format.",
output_format=artifact_type,
)
raise BuildUserError(
BuildUserError.BUILD_OUTPUT_HAS_MULTIPLE_FILES.format(
artifact_type=artifact_type
)
)
elif artifact_format_files == 0:
raise BuildUserError(
BuildUserError.BUILD_OUTPUT_HAS_0_FILES.format(
artifact_type=artifact_type
)
)

# If all the conditions were met, the artifact is valid
valid_artifacts.append(artifact_type)

return valid_artifacts

def on_success(self, retval, task_id, args, kwargs):
valid_artifacts = self.get_valid_artifact_types()

# NOTE: we are updating the db version instance *only* when
if html:
# TODO: remove this condition and *always* update the DB Version instance
if "html" in valid_artifacts:
try:
api_v2.version(self.data.version.pk).patch(
{
"built": True,
"documentation_type": self.data.version.documentation_type,
"has_pdf": pdf,
"has_epub": epub,
"has_htmlzip": localmedia,
"has_pdf": "pdf" in valid_artifacts,
"has_epub": "epub" in valid_artifacts,
"has_htmlzip": "htmlzip" in valid_artifacts,
}
)
except HttpClientError:
# NOTE: I think we should fail the build if we cannot update
# the version at this point. Otherwise, we will have inconsistent data
log.exception(
'Updating version failed, skipping file sync.',
"Updating version db object failed. "
'Files are synced in the storage, but "Version" object is not updated',
)

# Index search data
Expand Down Expand Up @@ -658,10 +694,13 @@ def update_build(self, state=None):
try:
api_v2.build(self.data.build['id']).patch(self.data.build)
except Exception:
# NOTE: I think we should fail the build if we cannot update it
# at this point otherwise, the data will be inconsistent and we
# may be serving "new docs" but saying the "build have failed"
log.exception('Unable to update build')
# NOTE: we are updating the "Build" object on each `state`.
# Only if the last update fails, there may be some inconsistency
# between the "Build" object in our db and the reality.
#
# The `state` argument will help us to track this more and understand
# at what state our updates are failing and decide what to do.
log.exception("Error while updating the build object.", state=state)

def execute(self):
self.data.build_director = BuildDirector(
Expand Down Expand Up @@ -709,6 +748,12 @@ def execute(self):
finally:
self.data.build_data = self.collect_build_data()

# At this point, the user's build already succeeded.
# However, we cannot use `.on_success()` because we still have to upload the artifacts;
# which could fail, and we want to detect that and handle it properly at `.on_failure()`
# Store build artifacts to storage (local or cloud storage)
self.store_build_artifacts()

def collect_build_data(self):
"""
Collect data from the current build.
Expand Down Expand Up @@ -786,53 +831,30 @@ def store_build_artifacts(self):
Remove build artifacts of types not included in this build (PDF, ePub, zip only).
"""
time_before_store_build_artifacts = timezone.now()
log.info('Writing build artifacts to media storage')
# NOTE: I don't remember why we removed this state from the Build
# object. I'm re-adding it because I think it's useful, but we can
# remove it if we want
self.update_build(state=BUILD_STATE_UPLOADING)

valid_artifacts = self.get_valid_artifact_types()
log.bind(artifacts=valid_artifacts)

types_to_copy = []
types_to_delete = []

for artifact_type in ARTIFACT_TYPES:
if os.path.exists(
self.data.project.artifact_path(
version=self.data.version.slug,
type_=artifact_type,
)
):
if artifact_type in valid_artifacts:
types_to_copy.append(artifact_type)
# Never delete HTML nor JSON (search index)
elif artifact_type not in UNDELETABLE_ARTIFACT_TYPES:
types_to_delete.append(artifact_type)

# Upload formats
for media_type in types_to_copy:
# NOTE: here is where we get the correct FROM path to upload
from_path = self.data.version.project.artifact_path(
from_path = self.data.project.artifact_path(
version=self.data.version.slug,
type_=media_type,
)

# Check if there are multiple files on source directories.
# These output format does not support multiple files yet.
# In case multiple files are found, the upload for this format is not performed.
#
# TODO: we should fail the build for these cases and clearly communicate this.
# to the user. To do this, we need to call this method (``store_build_artifacts``)
# since the ``execute`` method.
# It will allow us to just `raise BuildUserError` and handle it at
# Celery `on_failure` handler.
if media_type in ("htmlzip", "epub", "pdf"):
if len(os.listdir(from_path)) > 1:
log.exception(
"Multiple files are not supported for this format. "
"Skipping this output format.",
output_format=media_type,
)
continue

to_path = self.data.version.project.get_storage_path(
to_path = self.data.project.get_storage_path(
type_=media_type,
version_slug=self.data.version.slug,
include_file=False,
Expand All @@ -844,15 +866,22 @@ def store_build_artifacts(self):
else:
build_media_storage.sync_directory(from_path, to_path)
except Exception:
# Ideally this should just be an IOError
# but some storage backends unfortunately throw other errors
# NOTE: the exceptions reported so far are:
# - botocore.exceptions:HTTPClientError
# - botocore.exceptions:ClientError
# - readthedocs.doc_builder.exceptions:BuildCancelled
log.exception(
'Error copying to storage (not failing build)',
"Error copying to storage",
media_type=media_type,
from_path=from_path,
to_path=to_path,
)
# Re-raise the exception to fail the build and handle it
# automatically at `on_failure`.
# It will clearly communicate the error to the user.
raise BuildAppError("Error uploading files to the storage.")

# Delete formats
for media_type in types_to_delete:
media_path = self.data.version.project.get_storage_path(
type_=media_type,
Expand All @@ -863,13 +892,21 @@ def store_build_artifacts(self):
try:
build_media_storage.delete_directory(media_path)
except Exception:
# Ideally this should just be an IOError
# but some storage backends unfortunately throw other errors
# NOTE: I didn't find any log line for this case yet
log.exception(
'Error deleting from storage (not failing build)',
"Error deleting files from storage",
media_type=media_type,
media_path=media_path,
)
# Re-raise the exception to fail the build and handle it
# automatically at `on_failure`.
# It will clearly communicate the error to the user.
raise BuildAppError("Error deleting files from storage.")

log.info(
"Store build artifacts finished.",
time=(timezone.now() - time_before_store_build_artifacts).seconds,
)

def send_notifications(self, version_pk, build_pk, event):
"""Send notifications to all subscribers of `event`."""
Expand Down
24 changes: 17 additions & 7 deletions readthedocs/projects/tests/test_build_tasks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import pathlib
from unittest import mock

import django_dynamic_fixture as fixture
Expand Down Expand Up @@ -200,8 +201,14 @@ def test_build_updates_documentation_type(self, load_yaml_config):
# Create the artifact paths, so that `store_build_artifacts`
# properly runs: https://github.com/readthedocs/readthedocs.org/blob/faa611fad689675f81101ea643770a6b669bf529/readthedocs/projects/tasks/builds.py#L798-L804
os.makedirs(self.project.artifact_path(version=self.version.slug, type_="html"))
os.makedirs(self.project.artifact_path(version=self.version.slug, type_="epub"))
os.makedirs(self.project.artifact_path(version=self.version.slug, type_="pdf"))
for f in ("epub", "pdf"):
os.makedirs(self.project.artifact_path(version=self.version.slug, type_=f))
pathlib.Path(
os.path.join(
self.project.artifact_path(version=self.version.slug, type_=f),
f"{self.project.slug}.{f}",
)
).touch()

self._trigger_update_docs_task()

Expand Down Expand Up @@ -322,11 +329,14 @@ def test_successful_build(
# Create the artifact paths, so it's detected by the builder
os.makedirs(self.project.artifact_path(version=self.version.slug, type_="html"))
os.makedirs(self.project.artifact_path(version=self.version.slug, type_="json"))
os.makedirs(
self.project.artifact_path(version=self.version.slug, type_="htmlzip")
)
os.makedirs(self.project.artifact_path(version=self.version.slug, type_="epub"))
os.makedirs(self.project.artifact_path(version=self.version.slug, type_="pdf"))
for f in ("htmlzip", "epub", "pdf"):
os.makedirs(self.project.artifact_path(version=self.version.slug, type_=f))
pathlib.Path(
os.path.join(
self.project.artifact_path(version=self.version.slug, type_=f),
f"{self.project.slug}.{f}",
)
).touch()

self._trigger_update_docs_task()

Expand Down

0 comments on commit 0c0d2c6

Please sign in to comment.