diff --git a/changelog/@unreleased/pr-661.v2.yml b/changelog/@unreleased/pr-661.v2.yml new file mode 100644 index 000000000..3dff75d97 --- /dev/null +++ b/changelog/@unreleased/pr-661.v2.yml @@ -0,0 +1,6 @@ +type: fix +fix: + description: PinUntilErrorChannel doesn't switch on 429, to unblock transactional + workflows + links: + - https://github.com/palantir/dialogue/pull/661 diff --git a/dialogue-core/src/main/java/com/palantir/dialogue/core/PinUntilErrorChannel.java b/dialogue-core/src/main/java/com/palantir/dialogue/core/PinUntilErrorChannel.java index bcedeb819..aa2f571fc 100644 --- a/dialogue-core/src/main/java/com/palantir/dialogue/core/PinUntilErrorChannel.java +++ b/dialogue-core/src/main/java/com/palantir/dialogue/core/PinUntilErrorChannel.java @@ -129,11 +129,13 @@ public Optional> maybeExecute(Endpoint endpoint, Requ .map(future -> DialogueFutures.addDirectCallback(future, new FutureCallback() { @Override public void onSuccess(Response response) { - if (Responses.isQosStatus(response) || Responses.isServerError(response)) { + // We specifically don't switch 429 responses to support transactional + // workflows where it is important for a large number of requests to all land on the same node, + // even if a couple of them get rate limited in the middle. + if (Responses.isServerError(response) + || (Responses.isQosStatus(response) && !Responses.isTooManyRequests(response))) { OptionalInt next = incrementHostIfNecessary(currentIndex); instrumentation.receivedErrorStatus(currentIndex, channel, response, next); - // TODO(dfox): handle 308 See Other somehow, as we currently don't have a host -> channel - // mapping } else { instrumentation.successfulResponse(currentIndex); } diff --git a/dialogue-core/src/test/java/com/palantir/dialogue/core/PinUntilErrorChannelTest.java b/dialogue-core/src/test/java/com/palantir/dialogue/core/PinUntilErrorChannelTest.java index 4f0dc86a7..e3dfcb296 100644 --- a/dialogue-core/src/test/java/com/palantir/dialogue/core/PinUntilErrorChannelTest.java +++ b/dialogue-core/src/test/java/com/palantir/dialogue/core/PinUntilErrorChannelTest.java @@ -87,12 +87,31 @@ public void channels_are_shuffled_initially_and_successful_requests_stay_on_chan @Test public void various_error_status_codes_cause_node_switch() { - testStatusCausesNodeSwitch(429); + testStatusCausesNodeSwitch(308); + testStatusCausesNodeSwitch(503); for (int errorStatus = 500; errorStatus < 600; errorStatus++) { testStatusCausesNodeSwitch(errorStatus); } } + @Test + void http_429_responses_do_not_cause_node_switch() { + setResponse(channel1, 100); + setResponse(channel2, 204); + + assertThat(IntStream.range(0, 6).map(number -> getCode(pinUntilErrorWithoutReshuffle))) + .describedAs("Should be locked on to channel2 initially") + .contains(204, 204, 204, 204, 204, 204); + + setResponse(channel2, 429); + + assertThat(IntStream.range(0, 6).map(number -> getCode(pinUntilErrorWithoutReshuffle))) + .describedAs("Even after receiving a 429, we must stay pinned on the same channel to support " + + "transactional workflows like the internal atlas-replacement, which rely on all requests " + + "hitting the same node. See PDS-117063 for an example.") + .contains(429, 429, 429, 429, 429, 429); + } + private void testStatusCausesNodeSwitch(int errorStatus) { before(); setResponse(channel1, 100); diff --git a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].png b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].png index 23d8f8d9e..44ccb6e50 100644 --- a/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].png +++ b/simulation/src/test/resources/one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2695d875da3f5de08556e4e17c04a024540263e496d6be24623475eae908628 -size 95779 +oid sha256:cc21f62c8e52bc6d2ebd6d6246f3b7085dbba125319fe4e567d8681b99e73435 +size 90873 diff --git a/simulation/src/test/resources/report.md b/simulation/src/test/resources/report.md index ba1b91115..e8dfb36b0 100644 --- a/simulation/src/test/resources/report.md +++ b/simulation/src/test/resources/report.md @@ -17,7 +17,7 @@ live_reloading[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=89.9% client_mean=PT3.8310808S server_cpu=PT1H55M21.66S client_received=2500/2500 server_resps=2500 codes={200=2248, 500=252} live_reloading[UNLIMITED_ROUND_ROBIN].txt: success=60.2% client_mean=PT2.84698S server_cpu=PT1H58M37.45S client_received=2500/2500 server_resps=2500 codes={200=1504, 500=996} one_big_spike[CONCURRENCY_LIMITER_BLACKLIST_ROUND_ROBIN].txt: success=79.0% client_mean=PT1.478050977S server_cpu=PT1M59.71393673S client_received=1000/1000 server_resps=790 codes={200=790, Failed to make a request=210} - one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT1.286733552S server_cpu=PT2M48.75S client_received=1000/1000 server_resps=1125 codes={200=1000} + one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=100.0% client_mean=PT1.135007332S server_cpu=PT2M49.65S client_received=1000/1000 server_resps=1131 codes={200=1000} one_big_spike[CONCURRENCY_LIMITER_ROUND_ROBIN].txt: success=100.0% client_mean=PT0.73895799S server_cpu=PT2M45S client_received=1000/1000 server_resps=1100 codes={200=1000} one_big_spike[UNLIMITED_ROUND_ROBIN].txt: success=100.0% client_mean=PT1.115837367S server_cpu=PT8M3.3S client_received=1000/1000 server_resps=3222 codes={200=1000} one_endpoint_dies_on_each_server[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt: success=64.4% client_mean=PT2.6866096S server_cpu=PT25M client_received=2500/2500 server_resps=2500 codes={200=1611, 500=889} diff --git a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt index 765081eff..a25aa8aed 100644 --- a/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt +++ b/simulation/src/test/resources/txt/one_big_spike[CONCURRENCY_LIMITER_PIN_UNTIL_ERROR].txt @@ -1 +1 @@ -success=100.0% client_mean=PT1.286733552S server_cpu=PT2M48.75S client_received=1000/1000 server_resps=1125 codes={200=1000} \ No newline at end of file +success=100.0% client_mean=PT1.135007332S server_cpu=PT2M49.65S client_received=1000/1000 server_resps=1131 codes={200=1000} \ No newline at end of file