Skip to content

Commit

Permalink
PinUntilErrorChannel doesn't switch on 429 (#661)
Browse files Browse the repository at this point in the history
PinUntilErrorChannel doesn't switch on 429, to unblock transactional workflows
  • Loading branch information
iamdanfox authored Apr 17, 2020
1 parent b79a078 commit e6ec9b2
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 8 deletions.
6 changes: 6 additions & 0 deletions changelog/@unreleased/pr-661.v2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
type: fix
fix:
description: PinUntilErrorChannel doesn't switch on 429, to unblock transactional
workflows
links:
- https://github.com/palantir/dialogue/pull/661
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,13 @@ public Optional<ListenableFuture<Response>> maybeExecute(Endpoint endpoint, Requ
.map(future -> DialogueFutures.addDirectCallback(future, new FutureCallback<Response>() {
@Override
public void onSuccess(Response response) {
if (Responses.isQosStatus(response) || Responses.isServerError(response)) {
// We specifically don't switch 429 responses to support transactional
// workflows where it is important for a large number of requests to all land on the same node,
// even if a couple of them get rate limited in the middle.
if (Responses.isServerError(response)
|| (Responses.isQosStatus(response) && !Responses.isTooManyRequests(response))) {
OptionalInt next = incrementHostIfNecessary(currentIndex);
instrumentation.receivedErrorStatus(currentIndex, channel, response, next);
// TODO(dfox): handle 308 See Other somehow, as we currently don't have a host -> channel
// mapping
} else {
instrumentation.successfulResponse(currentIndex);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,31 @@ public void channels_are_shuffled_initially_and_successful_requests_stay_on_chan

@Test
public void various_error_status_codes_cause_node_switch() {
testStatusCausesNodeSwitch(429);
testStatusCausesNodeSwitch(308);
testStatusCausesNodeSwitch(503);
for (int errorStatus = 500; errorStatus < 600; errorStatus++) {
testStatusCausesNodeSwitch(errorStatus);
}
}

@Test
void http_429_responses_do_not_cause_node_switch() {
setResponse(channel1, 100);
setResponse(channel2, 204);

assertThat(IntStream.range(0, 6).map(number -> getCode(pinUntilErrorWithoutReshuffle)))
.describedAs("Should be locked on to channel2 initially")
.contains(204, 204, 204, 204, 204, 204);

setResponse(channel2, 429);

assertThat(IntStream.range(0, 6).map(number -> getCode(pinUntilErrorWithoutReshuffle)))
.describedAs("Even after receiving a 429, we must stay pinned on the same channel to support "
+ "transactional workflows like the internal atlas-replacement, which rely on all requests "
+ "hitting the same node. See PDS-117063 for an example.")
.contains(429, 429, 429, 429, 429, 429);
}

private void testStatusCausesNodeSwitch(int errorStatus) {
before();
setResponse(channel1, 100);
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion simulation/src/test/resources/report.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=89.9% client_mean=PT3.8310808S server_cpu=PT1H55M21.66S client_received=2500/2500 server_resps=2500 codes={200=2248, 500=252}
live_reloading[UNLIMITED_ROUND_ROBIN].txt: success=60.2% client_mean=PT2.84698S server_cpu=PT1H58M37.45S client_received=2500/2500 server_resps=2500 codes={200=1504, 500=996}
one_big_spike[CONCURRENCY_LIMITER_BLACKLIST_ROUND_ROBIN].txt: success=79.0% client_mean=PT1.478050977S server_cpu=PT1M59.71393673S client_received=1000/1000 server_resps=790 codes={200=790, Failed to make a request=210}
one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT1.286733552S server_cpu=PT2M48.75S client_received=1000/1000 server_resps=1125 codes={200=1000}
one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT1.135007332S server_cpu=PT2M49.65S client_received=1000/1000 server_resps=1131 codes={200=1000}
one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.73895799S server_cpu=PT2M45S client_received=1000/1000 server_resps=1100 codes={200=1000}
one_big_spike[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT1.115837367S server_cpu=PT8M3.3S client_received=1000/1000 server_resps=3222 codes={200=1000}
one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=64.4% client_mean=PT2.6866096S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1611, 500=889}
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1 @@
success=100.0% client_mean=PT1.286733552S server_cpu=PT2M48.75S client_received=1000/1000 server_resps=1125 codes={200=1000}
success=100.0% client_mean=PT1.135007332S server_cpu=PT2M49.65S client_received=1000/1000 server_resps=1131 codes={200=1000}

0 comments on commit e6ec9b2

Please sign in to comment.