From 99af741ff6d38a93ef3034d6e5bc59a6d878137d Mon Sep 17 00:00:00 2001 From: Ioannis Sfakianakis Date: Wed, 6 Nov 2024 17:30:06 +0200 Subject: [PATCH] Allow download retrying if the computed and configured checksums differ. - We have identified that downloading images under poor networking conditions from datastores might fail silently due to the SDK libraries we use for for Azure and AWS. This happens when we are installing new applications. - Another observation we made is that when we manually try re-installing the applications, they eventually succeed in the installation. - With this patch, we automate retrying the image download a fixed number of times (5 by default) if the computed versus the configure checksums differ. Signed-off-by: Ioannis Sfakianakis --- .gitignore | 1 + docs/CONFIG-PROPERTIES.md | 1 + pkg/pillar/cmd/downloader/downloader.go | 15 ++++++++++- pkg/pillar/cmd/volumemgr/blob.go | 6 ++--- pkg/pillar/cmd/volumemgr/handledownloader.go | 28 +++++++++++++++++++- pkg/pillar/cmd/volumemgr/handleverifier.go | 6 ++++- pkg/pillar/cmd/volumemgr/updatestatus.go | 20 +++++++++++--- pkg/pillar/cmd/volumemgr/volumemgr.go | 9 ++++++- pkg/pillar/types/blob.go | 1 + pkg/pillar/types/downloadertypes.go | 3 +++ pkg/pillar/types/global.go | 5 ++++ pkg/pillar/types/global_test.go | 1 + 12 files changed, 84 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index eba888ad8e..213389c3d5 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ tools/compare-sbom-sources/vendor tools/get-deps/get-deps tools/get-deps/vendor pkg/installer/target +pkg/installer/vendor diff --git a/docs/CONFIG-PROPERTIES.md b/docs/CONFIG-PROPERTIES.md index 3c537516bd..f6c3fa7047 100644 --- a/docs/CONFIG-PROPERTIES.md +++ b/docs/CONFIG-PROPERTIES.md @@ -29,6 +29,7 @@ | timer.port.testbetterinterval | timer in seconds | 600 | test a higher prio port config | | network.fallback.any.eth | "enabled" or "disabled" | disabled (enabled forcefully during onboarding if no network config) | if no connectivity try any Ethernet, WiFi, or LTE with DHCP client | | network.download.max.cost | 0-255 | 0 | [max port cost for download](DEVICE-CONNECTIVITY.md) to avoid e.g., LTE ports | +| blob.download.max.retries | 1-10 | 5 | max download retries when image verification fails.| | debug.enable.usb | boolean | false | allow USB e.g. keyboards on device | | debug.enable.vga | boolean | false | allow VGA console on device | | debug.enable.ssh | authorized ssh key | empty string(ssh disabled) | allow ssh to EVE | diff --git a/pkg/pillar/cmd/downloader/downloader.go b/pkg/pillar/cmd/downloader/downloader.go index 48c16395df..9d0632ac4c 100644 --- a/pkg/pillar/cmd/downloader/downloader.go +++ b/pkg/pillar/cmd/downloader/downloader.go @@ -399,9 +399,22 @@ func handleModify(ctx *downloaderContext, key string, // If RefCount from zero to non-zero and status has error // or status is not downloaded then do install - if config.RefCount != 0 && (status.HasError() || status.State != types.DOWNLOADED) { + if config.RefCount != 0 && (status.HasError() || status.State != types.DOWNLOADED || + status.LastRetry != config.LastRetry) { log.Functionf("handleModify installing %s", config.Name) + if status.LastRetry != config.LastRetry { + log.Functionf("handleModify retry download %s", config.Name) + status.CurrentSize = 0 + status.Size = 0 + status.Progress = 0 + status.State = types.DOWNLOADING + status.LastRetry = config.LastRetry + publishDownloaderStatus(ctx, status) + } handleCreate(ctx, config, status, key, receiveChan) + // Retrying the download due to image verification failure + // This happens when: RefCount and retryCount is non-zero, + // and DownloaderStatus state is "downloaded" } else if status.RefCount != config.RefCount { log.Functionf("handleModify RefCount change %s from %d to %d", config.Name, status.RefCount, config.RefCount) diff --git a/pkg/pillar/cmd/volumemgr/blob.go b/pkg/pillar/cmd/volumemgr/blob.go index 8bc5043dcd..5c5389bdd6 100644 --- a/pkg/pillar/cmd/volumemgr/blob.go +++ b/pkg/pillar/cmd/volumemgr/blob.go @@ -484,12 +484,10 @@ func unpublishBlobStatus(ctx *volumemgrContext, blobs ...*types.BlobStatus) { // But the BlobStatus pointer might appear several times in // the list hence we better clear the Has*Ref if blob.HasDownloaderRef { - MaybeRemoveDownloaderConfig(ctx, blob.Sha256) - blob.HasDownloaderRef = false + MaybeRemoveDownloaderConfig(ctx, blob) } if blob.HasVerifierRef { - MaybeRemoveVerifyImageConfig(ctx, blob.Sha256) - blob.HasVerifierRef = false + MaybeRemoveVerifyImageConfig(ctx, blob) } //If blob is loaded, then remove it from CAS if blob.State == types.LOADED { diff --git a/pkg/pillar/cmd/volumemgr/handledownloader.go b/pkg/pillar/cmd/volumemgr/handledownloader.go index a51074d091..734fa38407 100644 --- a/pkg/pillar/cmd/volumemgr/handledownloader.go +++ b/pkg/pillar/cmd/volumemgr/handledownloader.go @@ -8,6 +8,7 @@ import ( "encoding/hex" "path" "strconv" + "time" "github.com/lf-edge/eve/pkg/pillar/types" ) @@ -18,10 +19,12 @@ func AddOrRefcountDownloaderConfig(ctx *volumemgrContext, blob types.BlobStatus) log.Functionf("AddOrRefcountDownloaderConfig for %s", blob.Sha256) refCount := uint(1) + lastRetry := time.Now() m := lookupDownloaderConfig(ctx, blob.Sha256) if m != nil { log.Functionf("downloader config exists for %s to refcount %d", blob.Sha256, m.RefCount) refCount = m.RefCount + 1 + lastRetry = m.LastRetry // We need to update datastore id before publishing the // datastore config because datastore id can be updated // in some cases. For example: @@ -64,6 +67,7 @@ func AddOrRefcountDownloaderConfig(ctx *volumemgrContext, blob types.BlobStatus) Size: size, Target: locFilename, RefCount: refCount, + LastRetry: lastRetry, } log.Functionf("AddOrRefcountDownloaderConfig: DownloaderConfig: %+v", n) publishDownloaderConfig(ctx, &n) @@ -83,7 +87,8 @@ func AddOrRefcountDownloaderConfig(ctx *volumemgrContext, blob types.BlobStatus) // ignored silently. // > If DownloaderConfig's Refcount was incremented before #3, then expired notification from the // Downloader will be ignored silently. -func MaybeRemoveDownloaderConfig(ctx *volumemgrContext, imageSha string) { +func MaybeRemoveDownloaderConfig(ctx *volumemgrContext, blob *types.BlobStatus) { + imageSha := blob.Sha256 log.Functionf("MaybeRemoveDownloaderConfig(%s)", imageSha) m := lookupDownloaderConfig(ctx, imageSha) @@ -102,9 +107,30 @@ func MaybeRemoveDownloaderConfig(ctx *volumemgrContext, imageSha string) { m.RefCount, imageSha) publishDownloaderConfig(ctx, m) + + // Remove downloader config reference from blob + blob.HasDownloaderRef = false log.Functionf("MaybeRemoveDownloaderConfig done for %s", imageSha) } +func retryDownload(ctx *volumemgrContext, imageSha string) { + m := lookupDownloaderConfig(ctx, imageSha) + if m == nil { + log.Functionf("retryDownload: config missing for %s", + imageSha) + return + } + if m.RefCount == 0 { + log.Warnf("retryDownload: Attempting to retry when "+ + "RefCount is 0. Image Details - Name: %s, ImageSha: %s, ", + m.Name, m.ImageSha256) + } + m.LastRetry = time.Now() + + publishDownloaderConfig(ctx, m) + log.Functionf("retryDownload done for %s", imageSha) +} + func publishDownloaderConfig(ctx *volumemgrContext, config *types.DownloaderConfig) { diff --git a/pkg/pillar/cmd/volumemgr/handleverifier.go b/pkg/pillar/cmd/volumemgr/handleverifier.go index f76cade519..3200d121f1 100644 --- a/pkg/pillar/cmd/volumemgr/handleverifier.go +++ b/pkg/pillar/cmd/volumemgr/handleverifier.go @@ -86,7 +86,8 @@ func MaybeAddVerifyImageConfigBlob(ctx *volumemgrContext, blob types.BlobStatus) // However, MaybeAddVerifyImageConfig can be called to increment the refcount // since the handshake with the verifier will not conclude until the // VerifyImageConfig is unpublished -func MaybeRemoveVerifyImageConfig(ctx *volumemgrContext, imageSha string) { +func MaybeRemoveVerifyImageConfig(ctx *volumemgrContext, blob *types.BlobStatus) { + imageSha := blob.Sha256 log.Functionf("MaybeRemoveVerifyImageConfig(%s)", imageSha) @@ -111,6 +112,9 @@ func MaybeRemoveVerifyImageConfig(ctx *volumemgrContext, imageSha string) { publishVerifyImageConfig(ctx, m) } log.Functionf("MaybeRemoveVerifyImageConfig done for %s", imageSha) + + // Remove the has verifier reference from blob status + blob.HasVerifierRef = false } // deleteVerifyImageConfig checks the refcount and if it is zero it diff --git a/pkg/pillar/cmd/volumemgr/updatestatus.go b/pkg/pillar/cmd/volumemgr/updatestatus.go index f2def1f7d8..64a65823d2 100644 --- a/pkg/pillar/cmd/volumemgr/updatestatus.go +++ b/pkg/pillar/cmd/volumemgr/updatestatus.go @@ -227,6 +227,20 @@ func doUpdateContentTree(ctx *volumemgrContext, status *types.ContentTreeStatus) blobErrorEntities = append(blobErrorEntities, &types.ErrorEntity{EntityID: blob.Sha256, EntityType: types.ErrorEntityContentBlob}) leftToProcess = true + if blob.IsErrorSource(types.VerifyImageStatus{}) && blob.RetryCount < blobDownloadMaxRetries { + + // Remove VerifyImage config and retry download + MaybeRemoveVerifyImageConfig(ctx, blob) + retryDownload(ctx, blobSha) + + // Increment retry count + blob.RetryCount++ + blob.State = types.DOWNLOADING + blob.ClearErrorWithSource() + + log.Errorf("EVE failed to verify Blob(%s), retrying %d/%d ...", blobSha, blob.RetryCount, blobDownloadMaxRetries) + publishBlobStatus(ctx, blob) + } } } @@ -412,15 +426,13 @@ func doUpdateContentTree(ctx *volumemgrContext, status *types.ContentTreeStatus) if blob.HasDownloaderRef { log.Functionf("doUpdateContentTree(%s): removing downloaderRef from Blob %s", status.Key(), blob.Sha256) - MaybeRemoveDownloaderConfig(ctx, blob.Sha256) - blob.HasDownloaderRef = false + MaybeRemoveDownloaderConfig(ctx, blob) } if blob.HasVerifierRef { log.Functionf("doUpdateContentTree(%s): removing verifyRef from Blob %s", status.Key(), blob.Sha256) - MaybeRemoveVerifyImageConfig(ctx, blob.Sha256) + MaybeRemoveVerifyImageConfig(ctx, blob) // Set the path to "" as we delete the verifier path - blob.HasVerifierRef = false blob.Path = "" } publishBlobStatus(ctx, blob) diff --git a/pkg/pillar/cmd/volumemgr/volumemgr.go b/pkg/pillar/cmd/volumemgr/volumemgr.go index fa1aeea83d..2894e36f41 100644 --- a/pkg/pillar/cmd/volumemgr/volumemgr.go +++ b/pkg/pillar/cmd/volumemgr/volumemgr.go @@ -42,7 +42,10 @@ const ( blankVolumeFormat = zconfig.Format_RAW // format of blank volume TODO: make configurable ) -var volumeFormat = make(map[string]zconfig.Format) +var ( + blobDownloadMaxRetries uint32 = 5 // Unless from GlobalConfig + volumeFormat = make(map[string]zconfig.Format) +) type volumemgrContext struct { agentbase.AgentBase @@ -746,6 +749,10 @@ func handleGlobalConfigImpl(ctxArg interface{}, key string, gcp := agentlog.HandleGlobalConfig(log, ctx.subGlobalConfig, agentName, ctx.CLIParams().DebugOverride, logger) if gcp != nil { + // Set max retries for blob download from global config + if gcp.GlobalValueInt(types.BlobDownloadMaxRetries) != 0 { + blobDownloadMaxRetries = gcp.GlobalValueInt(types.BlobDownloadMaxRetries) + } maybeUpdateConfigItems(ctx, gcp) ctx.globalConfig = gcp ctx.GCInitialized = true diff --git a/pkg/pillar/types/blob.go b/pkg/pillar/types/blob.go index f0ed8adceb..1475cfbbbf 100644 --- a/pkg/pillar/types/blob.go +++ b/pkg/pillar/types/blob.go @@ -47,6 +47,7 @@ type BlobStatus struct { Progress uint // ErrorAndTimeWithSource provide common error handling capabilities ErrorAndTimeWithSource + RetryCount uint32 } const ( diff --git a/pkg/pillar/types/downloadertypes.go b/pkg/pillar/types/downloadertypes.go index c9ae610425..b496ba0744 100644 --- a/pkg/pillar/types/downloadertypes.go +++ b/pkg/pillar/types/downloadertypes.go @@ -23,6 +23,7 @@ type DownloaderConfig struct { Size uint64 // In bytes FinalObjDir string // final Object Store RefCount uint + LastRetry time.Time } func (config DownloaderConfig) Key() string { @@ -119,6 +120,8 @@ type DownloaderStatus struct { RetryCount int // We save the original error when we do a retry OrigError string + // Used only when image verification fails after the download + LastRetry time.Time } func (status DownloaderStatus) Key() string { diff --git a/pkg/pillar/types/global.go b/pkg/pillar/types/global.go index 0c46553337..92db699b36 100644 --- a/pkg/pillar/types/global.go +++ b/pkg/pillar/types/global.go @@ -220,6 +220,10 @@ const ( // ports for image downloads. DownloadMaxPortCost GlobalSettingKey = "network.download.max.cost" + // BlobDownloadMaxRetries global setting key + // how many times EVE will retry to download a blob if its checksum is not verified + BlobDownloadMaxRetries GlobalSettingKey = "blob.download.max.retries" + // Bool Items // UsbAccess global setting key UsbAccess GlobalSettingKey = "debug.enable.usb" @@ -959,6 +963,7 @@ func NewConfigItemSpecMap() ConfigItemSpecMap { // LogRemainToSendMBytes - Default is 2 Gbytes, minimum is 10 Mbytes configItemSpecMap.AddIntItem(LogRemainToSendMBytes, 2048, 10, 0xFFFFFFFF) configItemSpecMap.AddIntItem(DownloadMaxPortCost, 0, 0, 255) + configItemSpecMap.AddIntItem(BlobDownloadMaxRetries, 5, 1, 10) // Goroutine Leak Detection section configItemSpecMap.AddIntItem(GoroutineLeakDetectionThreshold, 5000, 1, 0xFFFFFFFF) diff --git a/pkg/pillar/types/global_test.go b/pkg/pillar/types/global_test.go index 022c0a25d7..21c45b91d5 100644 --- a/pkg/pillar/types/global_test.go +++ b/pkg/pillar/types/global_test.go @@ -177,6 +177,7 @@ func TestNewConfigItemSpecMap(t *testing.T) { ForceFallbackCounter, LogRemainToSendMBytes, DownloadMaxPortCost, + BlobDownloadMaxRetries, // Bool Items UsbAccess, VgaAccess,