Skip to content

Commit

Permalink
Allow download retrying if the computed and configured checksums differ.
Browse files Browse the repository at this point in the history
- We have identified that downloading images under poor networking conditions
from datastores might fail silently due to the SDK libraries we use for
for Azure and AWS. This happens when we are installing new applications.
- Another observation we made is that when we manually try re-installing
the applications, they eventually succeed in the installation.
- With this patch, we automate retrying the image download a fixed number
of times (5 by default) if the computed versus the configure checksums differ.

Signed-off-by: Ioannis Sfakianakis <[email protected]>
  • Loading branch information
jsfakian committed Dec 10, 2024
1 parent 0b67531 commit 9884a04
Show file tree
Hide file tree
Showing 9 changed files with 63 additions and 3 deletions.
1 change: 1 addition & 0 deletions docs/CONFIG-PROPERTIES.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
| timer.port.testbetterinterval | timer in seconds | 600 | test a higher prio port config |
| network.fallback.any.eth | "enabled" or "disabled" | disabled (enabled forcefully during onboarding if no network config) | if no connectivity try any Ethernet, WiFi, or LTE with DHCP client |
| network.download.max.cost | 0-255 | 0 | [max port cost for download](DEVICE-CONNECTIVITY.md) to avoid e.g., LTE ports |
| blob.download.max.retries | 1-10 | 5 | max download retries when image verification fails.|
| debug.enable.usb | boolean | false | allow USB e.g. keyboards on device |
| debug.enable.vga | boolean | false | allow VGA console on device |
| debug.enable.ssh | authorized ssh key | empty string(ssh disabled) | allow ssh to EVE |
Expand Down
13 changes: 11 additions & 2 deletions pkg/pillar/cmd/downloader/downloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -398,9 +398,18 @@ func handleModify(ctx *downloaderContext, key string,
status.Expired, status.Name)

// If RefCount from zero to non-zero and status has error
// or status is not downloaded then do install
if config.RefCount != 0 && (status.HasError() || status.State != types.DOWNLOADED) {
// or status is not downloaded or retry then do install
if config.RefCount != 0 && (status.HasError() ||
status.State != types.DOWNLOADED || config.RetryCount != 0) {
log.Functionf("handleModify installing %s", config.Name)
if config.RetryCount != 0 {
log.Functionf("handleModify retry download %s", config.Name)
status.CurrentSize = 0
status.Size = 0
status.Progress = 0
status.State = types.DOWNLOADING
publishDownloaderStatus(ctx, status)
}
handleCreate(ctx, config, status, key, receiveChan)
} else if status.RefCount != config.RefCount {
log.Functionf("handleModify RefCount change %s from %d to %d",
Expand Down
19 changes: 19 additions & 0 deletions pkg/pillar/cmd/volumemgr/handledownloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ func AddOrRefcountDownloaderConfig(ctx *volumemgrContext, blob types.BlobStatus)
Size: size,
Target: locFilename,
RefCount: refCount,
RetryCount: 0,
}
log.Functionf("AddOrRefcountDownloaderConfig: DownloaderConfig: %+v", n)
publishDownloaderConfig(ctx, &n)
Expand Down Expand Up @@ -105,6 +106,24 @@ func MaybeRemoveDownloaderConfig(ctx *volumemgrContext, imageSha string) {
log.Functionf("MaybeRemoveDownloaderConfig done for %s", imageSha)
}

func retryDownload(ctx *volumemgrContext, imageSha string) {
m := lookupDownloaderConfig(ctx, imageSha)
if m == nil {
log.Functionf("retryDownload: config missing for %s",
imageSha)
return
}
if m.RefCount == 0 {
log.Fatalf("retryDownload: Attempting to retry when "+
"RefCount is 0. Image Details - Name: %s, ImageSha: %s, ",
m.Name, m.ImageSha256)
}
m.RetryCount++

publishDownloaderConfig(ctx, m)
log.Functionf("retryDownload done for %s", imageSha)
}

func publishDownloaderConfig(ctx *volumemgrContext,
config *types.DownloaderConfig) {

Expand Down
16 changes: 16 additions & 0 deletions pkg/pillar/cmd/volumemgr/updatestatus.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,22 @@ func doUpdateContentTree(ctx *volumemgrContext, status *types.ContentTreeStatus)
blobErrorEntities = append(blobErrorEntities, &types.ErrorEntity{EntityID: blob.Sha256, EntityType: types.ErrorEntityContentBlob})

leftToProcess = true
if blob.IsErrorSource(types.VerifyImageStatus{}) && blob.RetryCount < blobDownloadMaxRetries {

// Remove VerifyImage config and retry download
MaybeRemoveVerifyImageConfig(ctx, blobSha)
retryDownload(ctx, blobSha)

// Increment retry count
blob.RetryCount++
blob.State = types.DOWNLOADING
// Remove VerifyImageConfig reference
blob.HasVerifierRef = false
blob.ClearErrorWithSource()

log.Errorf("EVE failed to verify Blob(%s), retrying %d/%d ...", blobSha, blob.RetryCount, blobDownloadMaxRetries)
publishBlobStatus(ctx, blob)
}
}
}

Expand Down
9 changes: 8 additions & 1 deletion pkg/pillar/cmd/volumemgr/volumemgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ const (
blankVolumeFormat = zconfig.Format_RAW // format of blank volume TODO: make configurable
)

var volumeFormat = make(map[string]zconfig.Format)
var (
blobDownloadMaxRetries uint32 = 5 // Unless from GlobalConfig
volumeFormat = make(map[string]zconfig.Format)
)

type volumemgrContext struct {
agentbase.AgentBase
Expand Down Expand Up @@ -746,6 +749,10 @@ func handleGlobalConfigImpl(ctxArg interface{}, key string,
gcp := agentlog.HandleGlobalConfig(log, ctx.subGlobalConfig, agentName,
ctx.CLIParams().DebugOverride, logger)
if gcp != nil {
// Set max retries for blob download from global config
if gcp.GlobalValueInt(types.BlobDownloadMaxRetries) != 0 {
blobDownloadMaxRetries = gcp.GlobalValueInt(types.BlobDownloadMaxRetries)
}
maybeUpdateConfigItems(ctx, gcp)
ctx.globalConfig = gcp
ctx.GCInitialized = true
Expand Down
1 change: 1 addition & 0 deletions pkg/pillar/types/blob.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ type BlobStatus struct {
Progress uint
// ErrorAndTimeWithSource provide common error handling capabilities
ErrorAndTimeWithSource
RetryCount uint32
}

const (
Expand Down
1 change: 1 addition & 0 deletions pkg/pillar/types/downloadertypes.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ type DownloaderConfig struct {
Size uint64 // In bytes
FinalObjDir string // final Object Store
RefCount uint
RetryCount uint
}

func (config DownloaderConfig) Key() string {
Expand Down
5 changes: 5 additions & 0 deletions pkg/pillar/types/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,10 @@ const (
// ports for image downloads.
DownloadMaxPortCost GlobalSettingKey = "network.download.max.cost"

// BlobDownloadMaxRetries global setting key
// how many times EVE will retry to download a blob if its checksum is not verified
BlobDownloadMaxRetries GlobalSettingKey = "blob.download.max.retries"

// Bool Items
// UsbAccess global setting key
UsbAccess GlobalSettingKey = "debug.enable.usb"
Expand Down Expand Up @@ -948,6 +952,7 @@ func NewConfigItemSpecMap() ConfigItemSpecMap {
// LogRemainToSendMBytes - Default is 2 Gbytes, minimum is 10 Mbytes
configItemSpecMap.AddIntItem(LogRemainToSendMBytes, 2048, 10, 0xFFFFFFFF)
configItemSpecMap.AddIntItem(DownloadMaxPortCost, 0, 0, 255)
configItemSpecMap.AddIntItem(BlobDownloadMaxRetries, 5, 1, 0xFFFFFFFF)

// Goroutine Leak Detection section
configItemSpecMap.AddIntItem(GoroutineLeakDetectionThreshold, 5000, 1, 0xFFFFFFFF)
Expand Down
1 change: 1 addition & 0 deletions pkg/pillar/types/global_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ func TestNewConfigItemSpecMap(t *testing.T) {
ForceFallbackCounter,
LogRemainToSendMBytes,
DownloadMaxPortCost,
BlobDownloadMaxRetries,
// Bool Items
UsbAccess,
VgaAccess,
Expand Down

0 comments on commit 9884a04

Please sign in to comment.