diff --git a/src/yb/tserver/pg_client_session.cc b/src/yb/tserver/pg_client_session.cc index ad8a1206e06d..48a69f3c93d3 100644 --- a/src/yb/tserver/pg_client_session.cc +++ b/src/yb/tserver/pg_client_session.cc @@ -465,10 +465,27 @@ struct PerformData { // Prevent further paging reads from read restart errors. // See the ProcessUsedReadTime(...) function for details. *op_resp.mutable_paging_state()->mutable_read_time() = resp.catalog_read_time(); - } - if (transaction && transaction->isolation() == IsolationLevel::SERIALIZABLE_ISOLATION) { - // Delete read time from paging state since a read time is not used in serializable - // isolation level. + } else { + // Clear read time for the next page here unless absolutely necessary. + // + // Otherwise, if we do not clear read time here, a request for the + // next page with this read time can be sent back by the pg layer. + // Explicit read time in the request clears out existing local limits + // since the pg client session incorrectly believes that this passed + // read time is new. However, paging read time is simply a copy of + // the previous read time. + // + // Rely on + // 1. Either pg client session to set the read time. + // See pg_client_session.cc's SetupSession + // and transaction.cc's SetReadTimeIfNeeded + // and batcher.cc's ExecuteOperations + // 2. Or transaction used read time logic in transaction.cc + // 3. Or plain session's used read time logic in CheckPlainSessionPendingUsedReadTime + // to set the read time for the next page. + // + // Catalog sessions are not handled by the above logic, so + // we set the paging read time above. op_resp.mutable_paging_state()->clear_read_time(); } } diff --git a/src/yb/yql/pgwrapper/pg_local_limit_optimization-test.cc b/src/yb/yql/pgwrapper/pg_local_limit_optimization-test.cc index 20810300925e..6516d376ee10 100644 --- a/src/yb/yql/pgwrapper/pg_local_limit_optimization-test.cc +++ b/src/yb/yql/pgwrapper/pg_local_limit_optimization-test.cc @@ -167,6 +167,10 @@ class PgLocalLimitOptimizationTest : public PgMiniTestBase { // Force the scan in a single page ... ASSERT_OK(read_conn.Execute(Format( "SET yb_fetch_row_limit = $0", 2 * kNumInitialRows))); + } else { + // ... or multiple pages. + ASSERT_OK(read_conn.Execute(Format( + "SET yb_fetch_row_limit = $0", kNumInitialRows / 100))); } PopulateReadConnCache(read_conn); @@ -308,6 +312,26 @@ TEST_F(PgLocalLimitOptimizationTest, SinglePageScan) { InsertRowConcurrentlyWithTableScan(); } +// Before #22821, in a multi-page scan, for each subsequent page scan, +// the read time was set by pggate explicitly based on the used time +// returned by the response for the previous page. +// +// This behavior of overriding the read time also resets the per-tablet +// local limit map. There is no reason for pggate to send read time +// explicitly since the read time does not change across multiple pages. +// +// This test ensures that there is no read restart error just because the +// scan spans multiple pages. Fails without #22821. +TEST_F(PgLocalLimitOptimizationTest, MultiPageScan) { + // Test Config + is_single_tablet_ = true; + is_single_page_scan_ = false; + scan_cmd_ = ScanCmd::kOrdered; + + // Run Test + InsertRowConcurrentlyWithTableScan(); +} + // In a multi-tablet scan, the read time is // 1. Either picked on the local tserver proxy. (This test). // 2. Or picked on the first tserver that the scan hits.