Adding a new rate limiting method ("onAverageEvery(N)") which uses a …

…pseudo random number generator rather than counting the number of logs processed. This is useful in cases where regular rate limiting is too "predictable" (e.g. if application behaviour is very repeatable). RELNOTES=Adding new PRNG based "onAverageEvery(N)" rate limiting method. PiperOrigin-RevId: 520335712
google · Mar 29, 2023 · c25d34e · c25d34e
1 parent bddcc1d
commit c25d34e
Show file tree

Hide file tree

Showing 5 changed files with 271 additions and 20 deletions.
diff --git a/api/src/main/java/com/google/common/flogger/LogContext.java b/api/src/main/java/com/google/common/flogger/LogContext.java
@@ -75,6 +75,13 @@ private Key() {}
     public static final MetadataKey<Integer> LOG_EVERY_N =
         MetadataKey.single("ratelimit_count", Integer.class);
 
+    /**
+     * The key associated with a rate limiting counter for "1-in-N" randomly sampled rate limiting.
+     * The value is set by {@link LoggingApi#onAverageEvery(int)}.
+     */
+    public static final MetadataKey<Integer> LOG_SAMPLE_EVERY_N =
+        MetadataKey.single("sampling_count", Integer.class);
+
     /**
      * The key associated with a rate limiting period for "at most once every N" rate limiting. The
      * value is set by {@link LoggingApi#atMostEvery(int, TimeUnit)}.
@@ -496,11 +503,12 @@ protected final void removeMetadata(MetadataKey<?> key) {
    * <h2>Basic Responsibilities</h2>
    *
    * <p>This method is responsible for:
+   *
    * <ol>
-   * <li>Performing any rate limiting operations specific to the extended API.
-   * <li>Updating per log-site information (e.g. for debug metrics).
-   * <li>Adding any additional metadata to this context.
-   * <li>Returning whether logging should be attempted.
+   *   <li>Performing any rate limiting operations specific to the extended API.
+   *   <li>Updating per log-site information (e.g. for debug metrics).
+   *   <li>Adding any additional metadata to this context.
+   *   <li>Returning whether logging should be attempted.
    * </ol>
    *
    * <p>Implementations of this method must always call {@code super.postProcess()} first with the
@@ -555,10 +563,10 @@ protected final void removeMetadata(MetadataKey<?> key) {
    * post-processing, and no rate limiter state will be updated. This is sometimes desirable, but
    * the API documentation should make it clear to the user as to which behaviour occurs.
    *
-   * For example, level selector methods (such as {@code atInfo()}) return the {@code NoOp} API for
-   * "disabled" log statements, and these have no effect on rate limiter state, and will not update
-   * the "skipped" count. This is fine because controlling logging via log level selection is not
-   * conceptually a form of "rate limiting".
+   * <p>For example, level selector methods (such as {@code atInfo()}) return the {@code NoOp} API
+   * for "disabled" log statements, and these have no effect on rate limiter state, and will not
+   * update the "skipped" count. This is fine because controlling logging via log level selection is
+   * not conceptually a form of "rate limiting".
    *
    * <p>The default implementation of this method enforces the rate limits as set by {@link
    * #every(int)} and {@link #atMostEvery(int, TimeUnit)}.
@@ -573,14 +581,15 @@ protected boolean postProcess(@NullableDecl LogSiteKey logSiteKey) {
       if (logSiteKey != null) {
         // Since the base class postProcess() should be invoked before subclass logic, we can set
         // the initial status here. Subclasses can combine this with other rate limiter statuses by
-        // calling updateRateLimiterStatus() before we get back into shouldLog() .
-        rateLimitStatus =
-            RateLimitStatus.combine(
-                CountingRateLimiter.check(metadata, logSiteKey),
-                DurationRateLimiter.check(metadata, logSiteKey, timestampNanos));
+        // calling updateRateLimiterStatus() before we get back into shouldLog().
+        RateLimitStatus status = DurationRateLimiter.check(metadata, logSiteKey, timestampNanos);
+        status = RateLimitStatus.combine(status, CountingRateLimiter.check(metadata, logSiteKey));
+        status = RateLimitStatus.combine(status, SamplingRateLimiter.check(metadata, logSiteKey));
+        this.rateLimitStatus = status;
+
         // Early exit as soon as we know the log statement is disallowed. A subclass may still do
         // post processing but should never re-enable the log.
-        if (rateLimitStatus == RateLimitStatus.DISALLOW) {
+        if (status == RateLimitStatus.DISALLOW) {
           return false;
         }
       }
@@ -834,16 +843,25 @@ public API withStackTrace(StackSize size) {
 
   @Override
   public final API every(int n) {
+    return everyImpl(Key.LOG_EVERY_N, n, "rate limit");
+  }
+
+  @Override
+  public final API onAverageEvery(int n) {
+    return everyImpl(Key.LOG_SAMPLE_EVERY_N, n, "sampling");
+  }
+
+  private API everyImpl(MetadataKey<Integer> key, int n, String label) {
     // See wasForced() for discussion as to why this occurs before argument checking.
     if (wasForced()) {
       return api();
     }
     if (n <= 0) {
-      throw new IllegalArgumentException("rate limit count must be positive");
+      throw new IllegalArgumentException(label + " count must be positive");
     }
     // 1-in-1 rate limiting is a no-op.
     if (n > 1) {
-      addMetadata(Key.LOG_EVERY_N, n);
+      addMetadata(key, n);
     }
     return api();
   }

diff --git a/api/src/main/java/com/google/common/flogger/LoggingApi.java b/api/src/main/java/com/google/common/flogger/LoggingApi.java
@@ -55,8 +55,9 @@ public interface LoggingApi<API extends LoggingApi<API>> {
 
   /**
    * Modifies the current log statement to be emitted at most one-in-N times. The specified count
-   * must be greater than zero and it is expected, but not required, that it is constant. The first
-   * invocation of any rate-limited log statement will always be emitted.
+   * must be greater than zero and it is expected, but not required, that it is constant. In the
+   * absence of any other rate limiting, this method always allows the first invocation of any log
+   * statement to be emitted.
    *
    * <h3>Notes</h3>
    *
@@ -77,10 +78,36 @@ public interface LoggingApi<API extends LoggingApi<API>> {
    */
   API every(int n);
 
+  /**
+   * Modifies the current log statement to be emitted with likelihood 1 in {@code n}. For example,
+   * inserting {@code onAverageEvery(20)} into a call chain results in approximately 5% as many
+   * messages being emitted as before. Unlike the other rate-limiting options, there is no
+   * guarantee about when the first such message will be emitted, though it becomes highly likely as
+   * the number of calls reaches several times {@code n}.
+   *
+   * <h3>Notes</h3>
+   *
+   * If <em>multiple rate limiters</em> are used for a single log statement, that log statement will
+   * only be emitted once all rate limiters have reached their threshold, and when a log statement
+   * is emitted all the rate limiters are reset. In particular for {@code onAverageEvery(N)} this
+   * means that logs may occurs less frequently than one-in-N if other rate limiters are active.
+   * <p>
+   * When rate limiting is active, a {@code "skipped"} count is added to log statements to indicate
+   * how many logs were disallowed since the last log statement was emitted.
+   * <p>
+   * If this method is called multiple times for a single log statement, the last invocation will
+   * take precedence.
+   *
+   * @param n the factor by which to reduce logging frequency; a value of {@code 1} has no effect.
+   * @throws IllegalArgumentException if {@code n} is not positive.
+   */
+  API onAverageEvery(int n);
+
   /**
    * Modifies the current log statement to be emitted at most once per specified time period. The
    * specified duration must not be negative, and it is expected, but not required, that it is
-   * constant. The first invocation of any rate-limited log statement will always be emitted.
+   * constant.  In the absence of any other rate limiting, this method always allows the first
+   * invocation of any log statement to be emitted.
    * <p>
    * Note that for performance reasons {@code atMostEvery()} is explicitly <em>not</em> intended to
    * perform "proper" rate limiting to produce a limited average rate over many samples.
@@ -109,7 +136,7 @@ public interface LoggingApi<API extends LoggingApi<API>> {
    * <h3>Granularity</h3>
    *
    * Because the implementation of this feature relies on a nanosecond timestamp provided by the
-   * backend, the actual granularity of the underlying clock used may vary. Thus it is possible to
+   * backend, the actual granularity of the underlying clock used may vary, and it is possible to
    * specify a time period smaller than the smallest visible time increment. If this occurs, then
    * the effective rate limit applied to the log statement will be the smallest available time
    * increment. For example, if the system clock granularity is 1 millisecond, and a
@@ -841,6 +868,11 @@ public final API every(int n) {
       return noOp();
     }
 
+    @Override
+    public final API onAverageEvery(int n) {
+      return noOp();
+    }
+
     @Override
     public final API atMostEvery(int n, TimeUnit unit) {
       checkNotNull(unit, "time unit");

diff --git a/api/src/main/java/com/google/common/flogger/SamplingRateLimiter.java b/api/src/main/java/com/google/common/flogger/SamplingRateLimiter.java
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2023 The Flogger Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.common.flogger;
+
+import static com.google.common.flogger.LogContext.Key.LOG_SAMPLE_EVERY_N;
+import static com.google.common.flogger.RateLimitStatus.DISALLOW;
+
+import com.google.common.flogger.backend.Metadata;
+import java.util.Random;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.checkerframework.checker.nullness.compatqual.NullableDecl;
+
+/**
+ * Rate limiter to support {@code onAverageEvery(N)} functionality.
+ *
+ * <p>Instances of this class are created for each unique {@link LogSiteKey} for which rate limiting
+ * via the {@code LOG_SAMPLE_EVERY_N} metadata key is required. This class implements {@code
+ * RateLimitStatus} as a mechanism for resetting its own state.
+ *
+ * <p>This class is thread safe.
+ */
+final class SamplingRateLimiter extends RateLimitStatus {
+  private static final LogSiteMap<SamplingRateLimiter> map =
+      new LogSiteMap<SamplingRateLimiter>() {
+        @Override
+        protected SamplingRateLimiter initialValue() {
+          return new SamplingRateLimiter();
+        }
+      };
+
+  @NullableDecl
+  static RateLimitStatus check(Metadata metadata, LogSiteKey logSiteKey) {
+    Integer rateLimitCount = metadata.findValue(LOG_SAMPLE_EVERY_N);
+    if (rateLimitCount == null || rateLimitCount <= 0) {
+      // Without valid rate limiter specific metadata, this limiter has no effect.
+      return null;
+    }
+    return map.get(logSiteKey, metadata).sampleOneIn(rateLimitCount);
+  }
+
+  // Even though Random is synchonized, we have to put it in a ThreadLocal to avoid thread
+  // contention. We cannot use ThreadLocalRandom (yet) due to JDK level.
+  private static final ThreadLocal<Random> random = new ThreadLocal<Random>() {
+    @Override
+    protected Random initialValue() {
+      return new Random();
+    }
+  };
+
+  // Visible for testing.
+  final AtomicInteger pendingCount = new AtomicInteger();
+
+  // Visible for testing.
+  SamplingRateLimiter() {}
+
+  RateLimitStatus sampleOneIn(int rateLimitCount) {
+    // Always "roll the dice" and adjust the count if necessary (even if we were already
+    // pending). This means that in the long run we will account for every time we roll a
+    // zero and the number of logs will end up statistically close to 1-in-N (even if at
+    // times they can be "bursty" due to the action of other rate limiting mechanisms).
+    int pending;
+    if (random.get().nextInt(rateLimitCount) == 0) {
+      pending = pendingCount.incrementAndGet();
+    } else {
+      pending = pendingCount.get();
+    }
+    return pending > 0 ? this : DISALLOW;
+  }
+
+  @Override
+  public void reset() {
+    pendingCount.decrementAndGet();
+  }
+}
diff --git a/api/src/test/java/com/google/common/flogger/LogContextTest.java b/api/src/test/java/com/google/common/flogger/LogContextTest.java
@@ -31,6 +31,7 @@
 import com.google.common.base.Splitter;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterators;
+import com.google.common.collect.Range;
 import com.google.common.flogger.DurationRateLimiter.RateLimitPeriod;
 import com.google.common.flogger.LogContext.Key;
 import com.google.common.flogger.context.Tags;
@@ -216,6 +217,40 @@ public void testEveryN() {
     backend.assertLogged(2).metadata().containsUniqueEntry(Key.SKIPPED_LOG_COUNT, 4);
   }
 
+  @Test
+  public void testOnAverageEveryN() {
+    FakeLoggerBackend backend = new FakeLoggerBackend();
+    TestLogger logger = TestLogger.create(backend);
+
+    long startNanos = MILLISECONDS.toNanos(System.currentTimeMillis());
+    // Logging occurs randomly 1-in-5 times over 1000 log statements.
+    for (int millis = 0, count = 0; millis <= 1000; millis += 1) {
+      long timestampNanos = startNanos + MILLISECONDS.toNanos(millis);
+      logger.at(INFO, timestampNanos).onAverageEvery(5).log("Count=%d", count++);
+    }
+
+    // Satisically impossible that we randomly get +/- 100 over 1000 logs.
+    assertThat(backend.getLoggedCount()).isIn(Range.closed(100, 300));
+    backend.assertLogged(0).metadata().containsUniqueEntry(Key.LOG_SAMPLE_EVERY_N, 5);
+
+    // Check the expected count and skipped-count for each log based on the timestamp.
+    int lastLogIndex = -1;
+    for (int n = 0; n < backend.getLoggedCount(); n++) {
+      // The timestamp increases by 1 millisecond each time so we can get the log index from it.
+      long deltaNanos = backend.getLogged(n).getTimestampNanos() - startNanos;
+      int logIndex = (int) (deltaNanos / MILLISECONDS.toNanos(1));
+      backend.assertLogged(n).hasArguments(logIndex);
+      // This works even if lastLogIndex == -1.
+      int skipped = (logIndex - lastLogIndex) - 1;
+      if (skipped == 0) {
+        backend.assertLogged(n).metadata().keys().doesNotContain(Key.SKIPPED_LOG_COUNT);
+      } else {
+        backend.assertLogged(n).metadata().containsUniqueEntry(Key.SKIPPED_LOG_COUNT, skipped);
+      }
+      lastLogIndex = logIndex;
+    }
+  }
+
   @Test
   public void testAtMostEvery() {
     FakeLoggerBackend backend = new FakeLoggerBackend();

diff --git a/api/src/test/java/com/google/common/flogger/SamplingRateLimiterTest.java b/api/src/test/java/com/google/common/flogger/SamplingRateLimiterTest.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2023 The Flogger Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.common.flogger;
+
+import static com.google.common.flogger.LogContext.Key.LOG_SAMPLE_EVERY_N;
+import static com.google.common.truth.Truth.assertThat;
+
+import com.google.common.collect.Range;
+import com.google.common.flogger.backend.Metadata;
+import com.google.common.flogger.testing.FakeLogSite;
+import com.google.common.flogger.testing.FakeMetadata;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class SamplingRateLimiterTest {
+  @Test
+  public void testInvalidCount() {
+    Metadata metadata = new FakeMetadata().add(LOG_SAMPLE_EVERY_N, 0);
+    assertThat(SamplingRateLimiter.check(metadata, FakeLogSite.unique())).isNull();
+  }
+
+  @Test
+  public void testPendingCount() {
+    SamplingRateLimiter limiter = new SamplingRateLimiter();
+    // Initially we are not "pending", so disallow logging for an "impossible" sample rate.
+    assertThat(limiter.pendingCount.get()).isEqualTo(0);
+    assertThat(limiter.sampleOneIn(Integer.MAX_VALUE)).isEqualTo(RateLimitStatus.DISALLOW);
+    for (int i = 0; i < 100; i++) {
+      RateLimitStatus unused = limiter.sampleOneIn(5);
+    }
+    // Statistically we should be pending at least once.
+    int pendingCount = limiter.pendingCount.get();
+    assertThat(pendingCount).isGreaterThan(0);
+    // Now we are pending, we allow logging even for an "impossible" sample rate.
+    assertThat(limiter.sampleOneIn(Integer.MAX_VALUE)).isNotEqualTo(RateLimitStatus.DISALLOW);
+    limiter.reset();
+    assertThat(limiter.pendingCount.get()).isEqualTo(pendingCount - 1);
+  }
+
+  @Test
+  public void testSamplingRate() {
+    // Chance is less than one-millionth of 1% that this will fail spuriously.
+    Metadata metadata = new FakeMetadata().add(LOG_SAMPLE_EVERY_N, 2);
+    assertThat(countNSamples(1000, metadata)).isIn(Range.closed(400, 600));
+
+    // Expected average is 20 logs out of 1000. Seeing 0 or > 100 is enormously unlikely.
+    metadata = new FakeMetadata().add(LOG_SAMPLE_EVERY_N, 50);
+    assertThat(countNSamples(1000, metadata)).isIn(Range.closed(1, 100));
+  }
+
+  private static int countNSamples(int n, Metadata metadata) {
+    LogSite logSite = FakeLogSite.unique();
+    int sampled = 0;
+    while (n-- > 0) {
+      if (RateLimitStatus.checkStatus(
+          SamplingRateLimiter.check(metadata, logSite), logSite, metadata) >= 0) {
+        sampled++;
+      }
+    }
+    return sampled;
+  }
+}