-
Notifications
You must be signed in to change notification settings - Fork 535
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Restart summarizer after 10 seconds #15140
Changes from all commits
73d89cf
f4f55de
b255170
3b13925
d52ad59
4ca1cd0
87d5f6f
432d571
2e037d8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ import { | |
} from "@fluidframework/container-runtime-definitions"; | ||
import { | ||
assert, | ||
delay, | ||
LazyPromise, | ||
Trace, | ||
TypedEventEmitter, | ||
|
@@ -508,6 +509,13 @@ const defaultCompressionConfig = { | |
|
||
const defaultChunkSizeInBytes = 204800; | ||
|
||
/** | ||
* Instead of refreshing from latest because we do not have 100% confidence in the state | ||
* of the current system, we should close the summarizer and let it recover. | ||
* This delay's goal is to prevent tight restart loops | ||
*/ | ||
const defaultCloseSummarizerDelayMs = 10000; // 10 seconds | ||
|
||
/** | ||
* @deprecated - use ContainerRuntimeMessage instead | ||
*/ | ||
|
@@ -910,6 +918,8 @@ export class ContainerRuntime | |
private emitDirtyDocumentEvent = true; | ||
private readonly enableOpReentryCheck: boolean; | ||
private readonly disableAttachReorder: boolean | undefined; | ||
private readonly summaryStateUpdateMethod: string | undefined; | ||
private readonly closeSummarizerDelayMs: number; | ||
|
||
private readonly defaultTelemetrySignalSampleCount = 100; | ||
private _perfSignalData: IPerfSignalReport = { | ||
|
@@ -1283,6 +1293,14 @@ export class ContainerRuntime | |
this.remoteMessageProcessor.clearPartialMessagesFor(clientId); | ||
}); | ||
|
||
this.summaryStateUpdateMethod = this.mc.config.getString( | ||
"Fluid.ContainerRuntime.Test.SummaryStateUpdateMethod", | ||
); | ||
const closeSummarizerDelayOverride = this.mc.config.getNumber( | ||
"Fluid.ContainerRuntime.Test.CloseSummarizerDelayOverrideMs", | ||
); | ||
this.closeSummarizerDelayMs = closeSummarizerDelayOverride ?? defaultCloseSummarizerDelayMs; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Combine this with the above statement. The extra variable is not needed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I need this to be logged in telemetry. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I meant remove the const closeSummarizerDelayOverride |
||
|
||
this.summaryCollection = new SummaryCollection(this.deltaManager, this.logger); | ||
|
||
this.dirtyContainer = | ||
|
@@ -1423,6 +1441,8 @@ export class ContainerRuntime | |
disableChunking, | ||
disableAttachReorder: this.disableAttachReorder, | ||
disablePartialFlush, | ||
abnormalAckRecoveryMethod: this.summaryStateUpdateMethod, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @tyler-cai-microsoft I believe you missed updating this property name in the log. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
closeSummarizerDelayOverride, | ||
}), | ||
telemetryDocumentId: this.telemetryDocumentId, | ||
groupedBatchingEnabled: this.groupedBatchingEnabled, | ||
|
@@ -3210,21 +3230,23 @@ export class ContainerRuntime | |
readAndParseBlob: ReadAndParseBlob, | ||
versionId: string | null, | ||
): Promise<{ snapshotTree: ISnapshotTree; versionId: string; latestSnapshotRefSeq: number }> { | ||
const recoveryMethod = this.mc.config.getString( | ||
"Fluid.ContainerRuntime.Test.SummarizationRecoveryMethod", | ||
); | ||
if (recoveryMethod === "restart") { | ||
if (this.summaryStateUpdateMethod === "restart") { | ||
const error = new GenericError("Restarting summarizer instead of refreshing"); | ||
|
||
this.mc.logger.sendTelemetryEvent( | ||
{ | ||
...event, | ||
eventName: "ClosingSummarizerOnSummaryStale", | ||
codePath: event.eventName, | ||
message: "Stopping fetch from storage", | ||
versionId: versionId != null ? versionId : undefined, | ||
closeSummarizerDelayMs: this.closeSummarizerDelayMs, | ||
}, | ||
error, | ||
); | ||
|
||
// Delay 10 seconds before restarting summarizer to prevent the summarizer from restarting too frequently. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 10 seconds seem like a arbitrary and long time. Any reason for picking up 10 seconds? There should be some way of telling from telemetry whether this is high / low and adjust it accordingly. You should consider adding some. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is 10 seconds problematic? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It may not be. But we need data to know that. For example, a busy doc can have thousands of ops in 10 seconds which make the next summary that much more expensive. Or it may turn out that 10 seconds is less and clients keep reloading. It's fine as a starting point but I was interested as to how you came up with 10 seconds? Plus there should be telemetry that tells us whether that number is good enough or not. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added telemetry, we will not know if 10 seconds is enough or not until we see the telemetry. It's an arbitrary number we came up with offline. If the summarizer cache is 60 seconds, we would at most restart 5 times which is not that bad of a worse case scenario. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's safety against tight loops that could happen if we have some state (that we can't envision) that results in infinite reloads (or possibly - tight reloads for 60 seconds, or something like that). It's arbitrary, and we should reevaluate it once we have more data from PROD. In ideal case we should not hit it :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not familiar with the summarizer lifecycle; but wondering if waiting 10 seconds here before stopping it VS after, makes a difference. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think so since this would be part of the entire submit summary call, but potentially since we are stopping the summarizer and a different promise may resolve. It's much safer to wait before we call stop |
||
await delay(this.closeSummarizerDelayMs); | ||
this._summarizer?.stop("latestSummaryStateStale"); | ||
this.closeFn(); | ||
throw error; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,7 +46,6 @@ import { | |
const maxSummarizeAckWaitTime = 10 * 60 * 1000; // 10 minutes | ||
|
||
const defaultNumberSummarizationAttempts = 2; // only up to 2 attempts | ||
const numberOfAttemptsOnRestartAsRecovery = 1; // Only summarize once | ||
|
||
/** | ||
* An instance of RunningSummarizer manages the heuristics for summarizing. | ||
|
@@ -133,7 +132,6 @@ export class RunningSummarizer implements IDisposable { | |
private heuristicRunner?: ISummarizeHeuristicRunner; | ||
private readonly generator: SummaryGenerator; | ||
private readonly mc: MonitoringContext; | ||
private readonly shouldAbortOnSummaryFailure: boolean; | ||
|
||
private enqueuedSummary: | ||
| { | ||
|
@@ -177,10 +175,6 @@ export class RunningSummarizer implements IDisposable { | |
}), | ||
); | ||
|
||
this.shouldAbortOnSummaryFailure = | ||
this.mc.config.getString("Fluid.ContainerRuntime.Test.SummarizationRecoveryMethod") === | ||
"restart"; | ||
|
||
if (configuration.state !== "disableHeuristics") { | ||
assert( | ||
this.configuration.state === "enabled", | ||
|
@@ -601,10 +595,9 @@ export class RunningSummarizer implements IDisposable { | |
let summaryAttempts = 0; | ||
let summaryAttemptsPerPhase = 0; | ||
// Reducing the default number of attempts to defaultNumberofSummarizationAttempts. | ||
let totalAttempts = this.shouldAbortOnSummaryFailure | ||
? numberOfAttemptsOnRestartAsRecovery | ||
: this.mc.config.getNumber("Fluid.Summarizer.Attempts") ?? | ||
defaultNumberSummarizationAttempts; | ||
let totalAttempts = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not doing a second attempt here is the correct approach since we already know the second attempt is going to fail. There is no point in doing the extra work. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
this.mc.config.getNumber("Fluid.Summarizer.Attempts") ?? | ||
defaultNumberSummarizationAttempts; | ||
|
||
if (totalAttempts > attempts.length) { | ||
this.mc.logger.sendTelemetryEvent({ | ||
|
@@ -676,20 +669,6 @@ export class RunningSummarizer implements IDisposable { | |
} | ||
} | ||
|
||
if (this.shouldAbortOnSummaryFailure) { | ||
this.mc.logger.sendTelemetryEvent( | ||
{ | ||
eventName: "ClosingSummarizerOnSummaryStale", | ||
reason, | ||
message: lastResult?.message, | ||
}, | ||
lastResult?.error, | ||
); | ||
|
||
this.stopSummarizerCallback("latestSummaryStateStale"); | ||
this.runtime.closeFn(); | ||
return; | ||
} | ||
// If all attempts failed, log error (with last attempt info) and close the summarizer container | ||
this.mc.logger.sendErrorEvent( | ||
{ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -669,61 +669,6 @@ describe("Runtime", () => { | |
assert.strictEqual(stopCall, 1); | ||
}); | ||
|
||
it("Should not retry on failure when stopping instead of restarting", async () => { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Move this test to container runtime? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The running summarizer is responsible for doing multiple retries. I don't think we can move this test to the container runtime There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, not this exact test. But testing of similar functionality - container runtime should close instead of fetching snapshot. Unless it already exists. |
||
settings["Fluid.ContainerRuntime.Test.SummarizationRecoveryMethod"] = "restart"; | ||
await startRunningSummarizer(); | ||
await emitNextOp(); | ||
|
||
// too early, should not run yet | ||
await emitNextOp(summaryConfig.maxOps - 1); | ||
assertRunCounts(0, 0, 0); | ||
|
||
// now should run a normal run | ||
await emitNextOp(1); | ||
assertRunCounts(1, 0, 0); | ||
const retryProps1 = { | ||
summarizeCount: 1, | ||
summaryAttemptsPerPhase: 1, | ||
summaryAttempts: 1, | ||
summaryAttemptPhase: 1, | ||
}; | ||
assert( | ||
mockLogger.matchEvents([ | ||
{ eventName: "Running:Summarize_generate", ...retryProps1 }, | ||
{ eventName: "Running:Summarize_Op", ...retryProps1 }, | ||
]), | ||
"unexpected log sequence", | ||
); | ||
|
||
// should not run, because our summary hasn't been acked/nacked yet | ||
await emitNextOp(summaryConfig.maxOps + 1); | ||
assertRunCounts(1, 0, 0); | ||
|
||
// should run with refresh after first nack | ||
await emitNack(); | ||
assertRunCounts(2, 0, 0, "retry1 should be refreshLatestAck"); | ||
const retryProps2 = { | ||
summarizeCount: 1, | ||
summarizerSuccessfulAttempts: 0, | ||
}; | ||
assert( | ||
mockLogger.matchEvents([ | ||
{ | ||
eventName: "Running:Summarize_cancel", | ||
...retryProps1, | ||
reason: getFailMessage("summaryNack"), | ||
}, | ||
{ | ||
eventName: "Running:ClosingSummarizerOnSummaryStale", | ||
...retryProps2, | ||
}, | ||
]), | ||
"unexpected log sequence", | ||
); | ||
assert.strictEqual(stopCall, 1); | ||
assert(mockRuntime.disposed, "runtime should be disposed!"); | ||
}); | ||
|
||
it("Should retry after delay on failures with retryAfter", async () => { | ||
await emitNextOp(); | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a comment there saying what this is used for.