Skip to content

Commit

Permalink
Allow download retrying if the computed and configured checksums differ.
Browse files Browse the repository at this point in the history
- We have identified that downloading images under poor networking conditions
from datastores might fail silently due to the SDK libraries we use for
for Azure and AWS. This happens when we are installing new applications.
- Another observation we made is that when we manually try re-installing
the applications, they eventually succeed in the installation.
- With this patch, we automate retrying the image download a fixed number
of times (5 by default) if the computed versus the configure checksums differ.

Signed-off-by: Ioannis Sfakianakis <[email protected]>
  • Loading branch information
jsfakian committed Dec 20, 2024
1 parent e872c81 commit 99af741
Show file tree
Hide file tree
Showing 12 changed files with 84 additions and 12 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ tools/compare-sbom-sources/vendor
tools/get-deps/get-deps
tools/get-deps/vendor
pkg/installer/target
pkg/installer/vendor
1 change: 1 addition & 0 deletions docs/CONFIG-PROPERTIES.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
| timer.port.testbetterinterval | timer in seconds | 600 | test a higher prio port config |
| network.fallback.any.eth | "enabled" or "disabled" | disabled (enabled forcefully during onboarding if no network config) | if no connectivity try any Ethernet, WiFi, or LTE with DHCP client |
| network.download.max.cost | 0-255 | 0 | [max port cost for download](DEVICE-CONNECTIVITY.md) to avoid e.g., LTE ports |
| blob.download.max.retries | 1-10 | 5 | max download retries when image verification fails.|
| debug.enable.usb | boolean | false | allow USB e.g. keyboards on device |
| debug.enable.vga | boolean | false | allow VGA console on device |
| debug.enable.ssh | authorized ssh key | empty string(ssh disabled) | allow ssh to EVE |
Expand Down
15 changes: 14 additions & 1 deletion pkg/pillar/cmd/downloader/downloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -399,9 +399,22 @@ func handleModify(ctx *downloaderContext, key string,

// If RefCount from zero to non-zero and status has error
// or status is not downloaded then do install
if config.RefCount != 0 && (status.HasError() || status.State != types.DOWNLOADED) {
if config.RefCount != 0 && (status.HasError() || status.State != types.DOWNLOADED ||
status.LastRetry != config.LastRetry) {
log.Functionf("handleModify installing %s", config.Name)
if status.LastRetry != config.LastRetry {
log.Functionf("handleModify retry download %s", config.Name)
status.CurrentSize = 0
status.Size = 0
status.Progress = 0
status.State = types.DOWNLOADING
status.LastRetry = config.LastRetry
publishDownloaderStatus(ctx, status)
}
handleCreate(ctx, config, status, key, receiveChan)
// Retrying the download due to image verification failure
// This happens when: RefCount and retryCount is non-zero,
// and DownloaderStatus state is "downloaded"
} else if status.RefCount != config.RefCount {
log.Functionf("handleModify RefCount change %s from %d to %d",
config.Name, status.RefCount, config.RefCount)
Expand Down
6 changes: 2 additions & 4 deletions pkg/pillar/cmd/volumemgr/blob.go
Original file line number Diff line number Diff line change
Expand Up @@ -484,12 +484,10 @@ func unpublishBlobStatus(ctx *volumemgrContext, blobs ...*types.BlobStatus) {
// But the BlobStatus pointer might appear several times in
// the list hence we better clear the Has*Ref
if blob.HasDownloaderRef {
MaybeRemoveDownloaderConfig(ctx, blob.Sha256)
blob.HasDownloaderRef = false
MaybeRemoveDownloaderConfig(ctx, blob)
}
if blob.HasVerifierRef {
MaybeRemoveVerifyImageConfig(ctx, blob.Sha256)
blob.HasVerifierRef = false
MaybeRemoveVerifyImageConfig(ctx, blob)
}
//If blob is loaded, then remove it from CAS
if blob.State == types.LOADED {
Expand Down
28 changes: 27 additions & 1 deletion pkg/pillar/cmd/volumemgr/handledownloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"encoding/hex"
"path"
"strconv"
"time"

"github.com/lf-edge/eve/pkg/pillar/types"
)
Expand All @@ -18,10 +19,12 @@ func AddOrRefcountDownloaderConfig(ctx *volumemgrContext, blob types.BlobStatus)
log.Functionf("AddOrRefcountDownloaderConfig for %s", blob.Sha256)

refCount := uint(1)
lastRetry := time.Now()
m := lookupDownloaderConfig(ctx, blob.Sha256)
if m != nil {
log.Functionf("downloader config exists for %s to refcount %d", blob.Sha256, m.RefCount)
refCount = m.RefCount + 1
lastRetry = m.LastRetry
// We need to update datastore id before publishing the
// datastore config because datastore id can be updated
// in some cases. For example:
Expand Down Expand Up @@ -64,6 +67,7 @@ func AddOrRefcountDownloaderConfig(ctx *volumemgrContext, blob types.BlobStatus)
Size: size,
Target: locFilename,
RefCount: refCount,
LastRetry: lastRetry,
}
log.Functionf("AddOrRefcountDownloaderConfig: DownloaderConfig: %+v", n)
publishDownloaderConfig(ctx, &n)
Expand All @@ -83,7 +87,8 @@ func AddOrRefcountDownloaderConfig(ctx *volumemgrContext, blob types.BlobStatus)
// ignored silently.
// > If DownloaderConfig's Refcount was incremented before #3, then expired notification from the
// Downloader will be ignored silently.
func MaybeRemoveDownloaderConfig(ctx *volumemgrContext, imageSha string) {
func MaybeRemoveDownloaderConfig(ctx *volumemgrContext, blob *types.BlobStatus) {
imageSha := blob.Sha256
log.Functionf("MaybeRemoveDownloaderConfig(%s)", imageSha)

m := lookupDownloaderConfig(ctx, imageSha)
Expand All @@ -102,9 +107,30 @@ func MaybeRemoveDownloaderConfig(ctx *volumemgrContext, imageSha string) {
m.RefCount, imageSha)

publishDownloaderConfig(ctx, m)

// Remove downloader config reference from blob
blob.HasDownloaderRef = false
log.Functionf("MaybeRemoveDownloaderConfig done for %s", imageSha)
}

func retryDownload(ctx *volumemgrContext, imageSha string) {
m := lookupDownloaderConfig(ctx, imageSha)
if m == nil {
log.Functionf("retryDownload: config missing for %s",
imageSha)
return
}
if m.RefCount == 0 {
log.Warnf("retryDownload: Attempting to retry when "+
"RefCount is 0. Image Details - Name: %s, ImageSha: %s, ",
m.Name, m.ImageSha256)
}
m.LastRetry = time.Now()

publishDownloaderConfig(ctx, m)
log.Functionf("retryDownload done for %s", imageSha)
}

func publishDownloaderConfig(ctx *volumemgrContext,
config *types.DownloaderConfig) {

Expand Down
6 changes: 5 additions & 1 deletion pkg/pillar/cmd/volumemgr/handleverifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ func MaybeAddVerifyImageConfigBlob(ctx *volumemgrContext, blob types.BlobStatus)
// However, MaybeAddVerifyImageConfig can be called to increment the refcount
// since the handshake with the verifier will not conclude until the
// VerifyImageConfig is unpublished
func MaybeRemoveVerifyImageConfig(ctx *volumemgrContext, imageSha string) {
func MaybeRemoveVerifyImageConfig(ctx *volumemgrContext, blob *types.BlobStatus) {
imageSha := blob.Sha256

log.Functionf("MaybeRemoveVerifyImageConfig(%s)", imageSha)

Expand All @@ -111,6 +112,9 @@ func MaybeRemoveVerifyImageConfig(ctx *volumemgrContext, imageSha string) {
publishVerifyImageConfig(ctx, m)
}
log.Functionf("MaybeRemoveVerifyImageConfig done for %s", imageSha)

// Remove the has verifier reference from blob status
blob.HasVerifierRef = false
}

// deleteVerifyImageConfig checks the refcount and if it is zero it
Expand Down
20 changes: 16 additions & 4 deletions pkg/pillar/cmd/volumemgr/updatestatus.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,20 @@ func doUpdateContentTree(ctx *volumemgrContext, status *types.ContentTreeStatus)
blobErrorEntities = append(blobErrorEntities, &types.ErrorEntity{EntityID: blob.Sha256, EntityType: types.ErrorEntityContentBlob})

leftToProcess = true
if blob.IsErrorSource(types.VerifyImageStatus{}) && blob.RetryCount < blobDownloadMaxRetries {

// Remove VerifyImage config and retry download
MaybeRemoveVerifyImageConfig(ctx, blob)
retryDownload(ctx, blobSha)

// Increment retry count
blob.RetryCount++
blob.State = types.DOWNLOADING
blob.ClearErrorWithSource()

log.Errorf("EVE failed to verify Blob(%s), retrying %d/%d ...", blobSha, blob.RetryCount, blobDownloadMaxRetries)
publishBlobStatus(ctx, blob)
}
}
}

Expand Down Expand Up @@ -412,15 +426,13 @@ func doUpdateContentTree(ctx *volumemgrContext, status *types.ContentTreeStatus)
if blob.HasDownloaderRef {
log.Functionf("doUpdateContentTree(%s): removing downloaderRef from Blob %s",
status.Key(), blob.Sha256)
MaybeRemoveDownloaderConfig(ctx, blob.Sha256)
blob.HasDownloaderRef = false
MaybeRemoveDownloaderConfig(ctx, blob)
}
if blob.HasVerifierRef {
log.Functionf("doUpdateContentTree(%s): removing verifyRef from Blob %s",
status.Key(), blob.Sha256)
MaybeRemoveVerifyImageConfig(ctx, blob.Sha256)
MaybeRemoveVerifyImageConfig(ctx, blob)
// Set the path to "" as we delete the verifier path
blob.HasVerifierRef = false
blob.Path = ""
}
publishBlobStatus(ctx, blob)
Expand Down
9 changes: 8 additions & 1 deletion pkg/pillar/cmd/volumemgr/volumemgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ const (
blankVolumeFormat = zconfig.Format_RAW // format of blank volume TODO: make configurable
)

var volumeFormat = make(map[string]zconfig.Format)
var (
blobDownloadMaxRetries uint32 = 5 // Unless from GlobalConfig
volumeFormat = make(map[string]zconfig.Format)
)

type volumemgrContext struct {
agentbase.AgentBase
Expand Down Expand Up @@ -746,6 +749,10 @@ func handleGlobalConfigImpl(ctxArg interface{}, key string,
gcp := agentlog.HandleGlobalConfig(log, ctx.subGlobalConfig, agentName,
ctx.CLIParams().DebugOverride, logger)
if gcp != nil {
// Set max retries for blob download from global config
if gcp.GlobalValueInt(types.BlobDownloadMaxRetries) != 0 {
blobDownloadMaxRetries = gcp.GlobalValueInt(types.BlobDownloadMaxRetries)
}
maybeUpdateConfigItems(ctx, gcp)
ctx.globalConfig = gcp
ctx.GCInitialized = true
Expand Down
1 change: 1 addition & 0 deletions pkg/pillar/types/blob.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ type BlobStatus struct {
Progress uint
// ErrorAndTimeWithSource provide common error handling capabilities
ErrorAndTimeWithSource
RetryCount uint32
}

const (
Expand Down
3 changes: 3 additions & 0 deletions pkg/pillar/types/downloadertypes.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ type DownloaderConfig struct {
Size uint64 // In bytes
FinalObjDir string // final Object Store
RefCount uint
LastRetry time.Time
}

func (config DownloaderConfig) Key() string {
Expand Down Expand Up @@ -119,6 +120,8 @@ type DownloaderStatus struct {
RetryCount int
// We save the original error when we do a retry
OrigError string
// Used only when image verification fails after the download
LastRetry time.Time
}

func (status DownloaderStatus) Key() string {
Expand Down
5 changes: 5 additions & 0 deletions pkg/pillar/types/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,10 @@ const (
// ports for image downloads.
DownloadMaxPortCost GlobalSettingKey = "network.download.max.cost"

// BlobDownloadMaxRetries global setting key
// how many times EVE will retry to download a blob if its checksum is not verified
BlobDownloadMaxRetries GlobalSettingKey = "blob.download.max.retries"

// Bool Items
// UsbAccess global setting key
UsbAccess GlobalSettingKey = "debug.enable.usb"
Expand Down Expand Up @@ -959,6 +963,7 @@ func NewConfigItemSpecMap() ConfigItemSpecMap {
// LogRemainToSendMBytes - Default is 2 Gbytes, minimum is 10 Mbytes
configItemSpecMap.AddIntItem(LogRemainToSendMBytes, 2048, 10, 0xFFFFFFFF)
configItemSpecMap.AddIntItem(DownloadMaxPortCost, 0, 0, 255)
configItemSpecMap.AddIntItem(BlobDownloadMaxRetries, 5, 1, 10)

// Goroutine Leak Detection section
configItemSpecMap.AddIntItem(GoroutineLeakDetectionThreshold, 5000, 1, 0xFFFFFFFF)
Expand Down
1 change: 1 addition & 0 deletions pkg/pillar/types/global_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ func TestNewConfigItemSpecMap(t *testing.T) {
ForceFallbackCounter,
LogRemainToSendMBytes,
DownloadMaxPortCost,
BlobDownloadMaxRetries,
// Bool Items
UsbAccess,
VgaAccess,
Expand Down

0 comments on commit 99af741

Please sign in to comment.