From 0a12c5224b0b6b6d937311e6d6d81c26b03b1d9d Mon Sep 17 00:00:00 2001 From: EmilLuta Date: Tue, 11 Jun 2024 18:10:53 +0200 Subject: [PATCH] fix: Treat 502s and 503s as transient for GCS OS (#2202) A refactoring introduced lately caused multiple restarts in provers (namely BWGs) when GCS was unavailable (502 or 503). This is a sporadic, once in a while, but still invalides tens of minutes of work and makes proving fickle and slow. This PR addresses the issue and restores old behavior pre-refactoring, treating 502s and 503s as transient errors. --- Cargo.lock | 1 + core/lib/object_store/Cargo.toml | 1 + core/lib/object_store/src/gcs.rs | 9 +++++++-- prover/Cargo.lock | 1 + 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0bb1fd0fced5..6feb9b0e472f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9036,6 +9036,7 @@ dependencies = [ "http", "prost 0.12.1", "rand 0.8.5", + "reqwest", "serde_json", "tempfile", "tokio", diff --git a/core/lib/object_store/Cargo.toml b/core/lib/object_store/Cargo.toml index 3e33c9097153..e400642bd2cd 100644 --- a/core/lib/object_store/Cargo.toml +++ b/core/lib/object_store/Cargo.toml @@ -26,6 +26,7 @@ rand.workspace = true tokio = { workspace = true, features = ["full"] } tracing.workspace = true prost.workspace = true +reqwest.workspace = true [dev-dependencies] assert_matches.workspace = true diff --git a/core/lib/object_store/src/gcs.rs b/core/lib/object_store/src/gcs.rs index 65d31bf53eaf..6960bd51f2fb 100644 --- a/core/lib/object_store/src/gcs.rs +++ b/core/lib/object_store/src/gcs.rs @@ -87,7 +87,7 @@ impl From for ObjectStoreError { fn from(err: AuthError) -> Self { let is_transient = matches!( &err, - AuthError::HttpError(err) if err.is_timeout() || err.is_connect() || has_transient_io_source(err) + AuthError::HttpError(err) if err.is_timeout() || err.is_connect() || has_transient_io_source(err) || upstream_unavailable(err) ); Self::Initialization { source: err.into(), @@ -111,6 +111,11 @@ fn has_transient_io_source(mut err: &(dyn StdError + 'static)) -> bool { } } +fn upstream_unavailable(err: &reqwest::Error) -> bool { + err.status() == Some(StatusCode::BAD_GATEWAY) + || err.status() == Some(StatusCode::SERVICE_UNAVAILABLE) +} + impl From for ObjectStoreError { fn from(err: HttpError) -> Self { let is_not_found = match &err { @@ -126,7 +131,7 @@ impl From for ObjectStoreError { } else { let is_transient = matches!( &err, - HttpError::HttpClient(err) if err.is_timeout() || err.is_connect() || has_transient_io_source(err) + HttpError::HttpClient(err) if err.is_timeout() || err.is_connect() || has_transient_io_source(err) || upstream_unavailable(err) ); ObjectStoreError::Other { is_transient, diff --git a/prover/Cargo.lock b/prover/Cargo.lock index d2de12c5682e..571bc59c18c3 100644 --- a/prover/Cargo.lock +++ b/prover/Cargo.lock @@ -9158,6 +9158,7 @@ dependencies = [ "http", "prost 0.12.6", "rand 0.8.5", + "reqwest", "serde_json", "tokio", "tracing",