From 446f2ad4b0f3d0a7e6eba8d76c29e85253502429 Mon Sep 17 00:00:00 2001
From: Ivo Anjo
Date: Fri, 19 Jul 2024 16:52:25 +0100
Subject: [PATCH] [PROF-10201] Reduce allocation profiling overhead by using
 coarse timestamps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**What does this PR do?**

This PR reduces the allocation profiling overhead (or: optimizes allocation profiling :D ) by using coarse timestamps
on the `on_newobj_event` hot path.

**Motivation:**

When allocation profiling is enabled, the profiler gets called for almost every object allocated in the Ruby VM. Thus,
this code path is extremely sensitive: the less work we do before we hand control back to Ruby, the less impact
allocation profiling has on the application.

The dynamic sampling rate mechanism we employ takes the current timestamp as an input to decide if "enough time" has
elapsed since it last readjusted itself. But "enough time" right now is *one second*, and thus we can get away with
using `CLOCK_MONOTONIC_COARSE` on Linux, which is noticeably cheaper than the regular `CLOCK_MONOTONIC`.

**Additional Notes:**

Enabling the use of the coarse monotonic clock _sometimes_ on the discrete dynamic sampler required it to "spill" some
of its guts out to the caller, so that the caller could correctly use the coarse clock on the hot path.

**How to test the change?**

Here's my experiment to compare three different clock sources I evaluated:

```c++
// Build with
// `g++ time_sources.cpp -o time_sources -lbenchmark -lpthread`
// where benchmark is aka `apt install libbenchmark1 libbenchmark-dev` on ubuntu/debian

#include <benchmark/benchmark.h>
#include <x86intrin.h> // For __rdtsc
#include <time.h>      // For clock_gettime

static void BM_RDTSC(benchmark::State& state) {
  for (auto _ : state) {
    benchmark::DoNotOptimize(__rdtsc());
  }
}

static void BM_ClockMonotonic(benchmark::State& state) {
  timespec ts;
  for (auto _ : state) {
    clock_gettime(CLOCK_MONOTONIC, &ts);
    benchmark::DoNotOptimize(ts);
  }
}

static void BM_ClockMonotonicCoarse(benchmark::State& state) {
  timespec ts;
  for (auto _ : state) {
    clock_gettime(CLOCK_MONOTONIC_COARSE, &ts);
    benchmark::DoNotOptimize(ts);
  }
}

BENCHMARK(BM_RDTSC);
BENCHMARK(BM_ClockMonotonic);
BENCHMARK(BM_ClockMonotonicCoarse);

BENCHMARK_MAIN();
```
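Relatedly, the speed/resolution trade-off can be inspected directly with `clock_getres` (the Red Hat page linked from
`time_helpers.h` suggests exactly that). This is just an illustrative snippet, not part of this PR; it assumes a Linux
system where `CLOCK_MONOTONIC_COARSE` is available, and the `clock_res.cpp` name is made up:

```c++
// Build with `g++ clock_res.cpp -o clock_res` (no extra dependencies needed)
#include <cstdio>
#include <ctime>

static void print_resolution(const char *name, clockid_t clock) {
  timespec res;
  if (clock_getres(clock, &res) != 0) {
    printf("%s: clock_getres failed\n", name);
    return;
  }
  // clock_getres reports the smallest tick the clock can represent
  printf("%s resolution: %ld.%09ld seconds\n", name, (long) res.tv_sec, res.tv_nsec);
}

int main(void) {
  print_resolution("CLOCK_MONOTONIC", CLOCK_MONOTONIC);
#ifdef CLOCK_MONOTONIC_COARSE
  print_resolution("CLOCK_MONOTONIC_COARSE", CLOCK_MONOTONIC_COARSE);
#endif
  return 0;
}
```

The coarse clock is expected to report a resolution in the millisecond range (it only advances on the timer tick),
which is still plenty for the "has one second elapsed?" check the dynamic sampler needs.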
Results of running the `time_sources` benchmark above on my machine:

```
./time_sources --benchmark_repetitions=10 --benchmark_report_aggregates_only=true
2024-07-19T10:48:20+01:00
Running ./time_sources
Run on (20 X 4900 MHz CPU s)
CPU Caches:
  L1 Data 48 KiB (x10)
  L1 Instruction 32 KiB (x10)
  L2 Unified 1280 KiB (x10)
  L3 Unified 24576 KiB (x1)
Load Average: 1.23, 1.30, 1.11
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
--------------------------------------------------------------------------
Benchmark                                Time             CPU   Iterations
--------------------------------------------------------------------------
BM_RDTSC_mean                         5.52 ns         5.52 ns           10
BM_RDTSC_median                       5.44 ns         5.44 ns           10
BM_RDTSC_stddev                      0.148 ns        0.147 ns           10
BM_RDTSC_cv                           2.67 %          2.67 %            10
BM_ClockMonotonic_mean                15.8 ns         15.8 ns           10
BM_ClockMonotonic_median              15.4 ns         15.4 ns           10
BM_ClockMonotonic_stddev              1.07 ns         1.07 ns           10
BM_ClockMonotonic_cv                  6.77 %          6.77 %            10
BM_ClockMonotonicCoarse_mean          5.92 ns         5.92 ns           10
BM_ClockMonotonicCoarse_median        5.93 ns         5.93 ns           10
BM_ClockMonotonicCoarse_stddev       0.041 ns        0.041 ns           10
BM_ClockMonotonicCoarse_cv            0.68 %          0.68 %            10
```

and here's the result of running `benchmarks/profiler_allocation.rb` comparing master to this branch:

```
ruby 2.7.7p221 (2022-11-24 revision 168ec2b1e5) [x86_64-linux]
Warming up --------------------------------------
Allocations (baseline)     1.431M i/100ms
Calculating -------------------------------------
Allocations (baseline)     14.370M (± 2.0%) i/s -    144.541M in  10.062635s
Warming up --------------------------------------
Allocations (master)       1.014M i/100ms
Calculating -------------------------------------
Allocations (master)       10.165M (± 1.0%) i/s -    102.390M in  10.074151s
Warming up --------------------------------------
Allocations (coarse)       1.179M i/100ms
Calculating -------------------------------------
Allocations (coarse)       11.495M (± 2.5%) i/s -    115.573M in  10.059971s

Comparison:
Allocations (baseline): 14369960.1 i/s
Allocations (coarse):   11495418.2 i/s - 1.25x  slower
Allocations (master):   10164615.7 i/s - 1.41x  slower
```

I've specifically used Ruby 2.7 for this comparison since this benchmark had a lot more variance (including baseline)
on later Rubies, and I wanted to isolate the specific changes to this code path.
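As a rough cross-check (back-of-envelope only; the hot path obviously does more than just read the clock), the
throughput numbers above imply a per-allocation cost that lines up reasonably well with the clock measurements:

```
baseline: 1 / 14.370M i/s ≈ 69.6 ns per allocation
master:   1 / 10.165M i/s ≈ 98.4 ns per allocation (≈ 28.8 ns of profiler overhead)
coarse:   1 / 11.495M i/s ≈ 87.0 ns per allocation (≈ 17.4 ns of profiler overhead)
```

The ~11 ns/allocation improvement is in the same ballpark as the ~10 ns gap between `CLOCK_MONOTONIC` (15.8 ns) and
`CLOCK_MONOTONIC_COARSE` (5.9 ns) measured above.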
---
 .../collectors_cpu_and_wall_time_worker.c    | 53 ++++++++++++++----
 .../collectors_discrete_dynamic_sampler.c    | 55 +++++++++++++------
 .../collectors_discrete_dynamic_sampler.h    | 38 +++++++++++--
 .../time_helpers.h                           | 10 ++--
 4 files changed, 121 insertions(+), 35 deletions(-)

diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
index bde8fc7281c..30a05e4f55a 100644
--- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
+++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c
@@ -1045,8 +1045,27 @@ static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self) {
   return are_allocations_being_tracked ? ULL2NUM(allocation_count) : Qnil;
 }
 
+#define HANDLE_CLOCK_FAILURE(call) ({ \
+  long _result = (call); \
+  if (_result == 0) { \
+    delayed_error(state, ERR_CLOCK_FAIL); \
+    return; \
+  } \
+  _result; \
+})
+
 // Implements memory-related profiling events. This function is called by Ruby via the `object_allocation_tracepoint`
 // when the RUBY_INTERNAL_EVENT_NEWOBJ event is triggered.
+//
+// When allocation sampling is enabled, this function gets called for almost all* objects allocated by the Ruby VM.
+// (*In some weird cases the VM may skip this tracepoint.)
+//
+// At a high level, there are two paths through this function:
+// 1. should_sample == false -> return
+// 2. should_sample == true -> sample
+//
+// On big applications, path 1. is the hottest, since we don't sample every object. So it's quite important for it to
+// be as fast as possible.
 static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
   // Update thread-local allocation count
   if (RB_UNLIKELY(allocation_count == UINT64_MAX)) {
@@ -1061,25 +1080,39 @@ static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused)
   // and disabled before it is cleared, but just in case...
   if (state == NULL) return;
 
-  // In a few cases, we may actually be allocating an object as part of profiler sampling. We don't want to recursively
+  // In rare cases, we may actually be allocating an object as part of profiler sampling. We don't want to recursively
   // sample, so we just return early
   if (state->during_sample) {
     state->stats.allocations_during_sample++;
     return;
   }
 
-  if (state->dynamic_sampling_rate_enabled) {
-    long now = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
-    if (now == 0) {
-      delayed_error(state, ERR_CLOCK_FAIL);
-      return;
-    }
-    if (!discrete_dynamic_sampler_should_sample(&state->allocation_sampler, now)) {
-      state->stats.allocation_skipped++;
-      return;
+  // Hot path: Dynamic sampling rate is usually enabled and the sampling decision is usually false
+  if (RB_LIKELY(state->dynamic_sampling_rate_enabled && !discrete_dynamic_sampler_should_sample(&state->allocation_sampler))) {
+    state->stats.allocation_skipped++;
+
+    coarse_instant now = monotonic_coarse_wall_time_now_ns();
+    HANDLE_CLOCK_FAILURE(now.timestamp_ns);
+
+    bool needs_readjust = discrete_dynamic_sampler_skipped_sample(&state->allocation_sampler, now);
+    if (RB_UNLIKELY(needs_readjust)) {
+      // We rarely readjust, so this is a cold path
+      // Also, while above we used the cheaper monotonic_coarse, for this call we want the regular monotonic call,
+      // which is why we end up getting time "again".
+      discrete_dynamic_sampler_readjust(
+        &state->allocation_sampler, HANDLE_CLOCK_FAILURE(monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE))
+      );
     }
+
+    return;
   }
 
+  // From here on, we've decided to go ahead with the sample, which is way less common than skipping it
+
+  discrete_dynamic_sampler_before_sample(
+    &state->allocation_sampler, HANDLE_CLOCK_FAILURE(monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE))
+  );
+
   // @ivoanjo: Strictly speaking, this is not needed because Ruby should not call the same tracepoint while a previous
   // invocation is still pending, (e.g. it wouldn't call `on_newobj_event` while it's already running), but I decided
   // to keep this here for consistency -- every call to the thread context (other than the special gc calls which are
diff --git a/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c b/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c
index d5ae3aedc00..2db303e07f4 100644
--- a/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c
+++ b/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c
@@ -21,6 +21,7 @@
 #define EMA_SMOOTHING_FACTOR 0.6
 
 static void maybe_readjust(discrete_dynamic_sampler *sampler, long now_ns);
+inline bool should_readjust(discrete_dynamic_sampler *sampler, coarse_instant now);
 
 void discrete_dynamic_sampler_init(discrete_dynamic_sampler *sampler, const char *debug_name, long now_ns) {
   sampler->debug_name = debug_name;
@@ -56,25 +57,25 @@ void discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sa
   return discrete_dynamic_sampler_reset(sampler, now_ns);
 }
 
-bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler, long now_ns) {
+// NOTE: See header for an explanation of when this should get used
+__attribute__((warn_unused_result))
+bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler) {
   // For efficiency reasons we don't do true random sampling but rather systematic
   // sampling following a sample interval/skip. This can be biased and hide patterns
   // but the dynamic interval and rather nondeterministic pattern of allocations in
   // most real applications should help reduce the bias impact.
   sampler->events_since_last_sample++;
   sampler->events_since_last_readjustment++;
-  bool should_sample = sampler->sampling_interval > 0 && sampler->events_since_last_sample >= sampler->sampling_interval;
 
-  if (should_sample) {
-    sampler->sample_start_time_ns = now_ns;
-  } else {
-    // check if we should readjust our sampler after this event, even if we didn't sample it
-    maybe_readjust(sampler, now_ns);
-  }
+  return sampler->sampling_interval > 0 && sampler->events_since_last_sample >= sampler->sampling_interval;
+}
 
-  return should_sample;
+// NOTE: See header for an explanation of when this should get used
+void discrete_dynamic_sampler_before_sample(discrete_dynamic_sampler *sampler, long now_ns) {
+  sampler->sample_start_time_ns = now_ns;
 }
 
+// NOTE: See header for an explanation of when this should get used
 long discrete_dynamic_sampler_after_sample(discrete_dynamic_sampler *sampler, long now_ns) {
   long last_sampling_time_ns = sampler->sample_start_time_ns == 0 ? 0 : long_max_of(0, now_ns - sampler->sample_start_time_ns);
   sampler->samples_since_last_readjustment++;
@@ -95,6 +96,11 @@ size_t discrete_dynamic_sampler_events_since_last_sample(discrete_dynamic_sample
   return sampler->events_since_last_sample;
 }
 
+// NOTE: See header for an explanation of when this should get used
+bool discrete_dynamic_sampler_skipped_sample(discrete_dynamic_sampler *sampler, coarse_instant now) {
+  return should_readjust(sampler, now);
+}
+
 static double ewma_adj_window(double latest_value, double avg, long current_window_time_ns, bool is_first) {
   if (is_first) {
     return latest_value;
@@ -110,18 +116,26 @@ static double ewma_adj_window(double latest_value, double avg, long current_wind
 }
 
 static void maybe_readjust(discrete_dynamic_sampler *sampler, long now_ns) {
-  long this_window_time_ns = sampler->last_readjust_time_ns == 0 ? ADJUSTMENT_WINDOW_NS : now_ns - sampler->last_readjust_time_ns;
+  if (should_readjust(sampler, to_coarse_instant(now_ns))) discrete_dynamic_sampler_readjust(sampler, now_ns);
+}
+
+inline bool should_readjust(discrete_dynamic_sampler *sampler, coarse_instant now) {
+  long this_window_time_ns =
+    sampler->last_readjust_time_ns == 0 ? ADJUSTMENT_WINDOW_NS : now.timestamp_ns - sampler->last_readjust_time_ns;
 
   bool should_readjust_based_on_time = this_window_time_ns >= ADJUSTMENT_WINDOW_NS;
   bool should_readjust_based_on_samples = sampler->samples_since_last_readjustment >= ADJUSTMENT_WINDOW_SAMPLES;
 
-  if (!should_readjust_based_on_time && !should_readjust_based_on_samples) {
-    // not enough time or samples have passed to perform a readjustment
-    return;
-  }
+  return should_readjust_based_on_time || should_readjust_based_on_samples;
+}
 
-  if (this_window_time_ns == 0) {
-    // should not be possible given previous condition but lets protect against div by 0 below.
+// NOTE: This method ASSUMES should_readjust == true
+// NOTE: See header for an explanation of when this should get used
+void discrete_dynamic_sampler_readjust(discrete_dynamic_sampler *sampler, long now_ns) {
+  long this_window_time_ns = sampler->last_readjust_time_ns == 0 ? ADJUSTMENT_WINDOW_NS : now_ns - sampler->last_readjust_time_ns;
+
+  if (this_window_time_ns <= 0) {
+    // should not be possible given should_readjust but let's protect against div by 0 below.
     return;
   }
@@ -406,7 +420,14 @@ VALUE _native_should_sample(VALUE self, VALUE now_ns) {
   sampler_state *state;
   TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);
 
-  return discrete_dynamic_sampler_should_sample(&state->sampler, NUM2LONG(now_ns)) ? Qtrue : Qfalse;
+  if (discrete_dynamic_sampler_should_sample(&state->sampler)) {
+    discrete_dynamic_sampler_before_sample(&state->sampler, NUM2LONG(now_ns));
+    return Qtrue;
+  } else {
+    bool needs_readjust = discrete_dynamic_sampler_skipped_sample(&state->sampler, to_coarse_instant(NUM2LONG(now_ns)));
+    if (needs_readjust) discrete_dynamic_sampler_readjust(&state->sampler, NUM2LONG(now_ns));
+    return Qfalse;
+  }
 }
 
 VALUE _native_after_sample(VALUE self, VALUE now_ns) {
diff --git a/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.h b/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.h
index 374dc453b6d..7e1edfb5c42 100644
--- a/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.h
+++ b/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.h
@@ -5,6 +5,8 @@
 
 #include
 
+#include "time_helpers.h"
+
 // A sampler that will sample discrete events based on the overhead of their
 // sampling.
 //
@@ -62,7 +64,6 @@ typedef struct discrete_dynamic_sampler {
   unsigned long sampling_time_clamps;
 } discrete_dynamic_sampler;
 
-
 // Init a new sampler with sane defaults.
 void discrete_dynamic_sampler_init(discrete_dynamic_sampler *sampler, const char *debug_name, long now_ns);
 
@@ -80,9 +81,38 @@ void discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sa
 // @return True if the event associated with this decision should be sampled, false
 //         otherwise.
 //
-// NOTE: If true is returned we implicitly assume the start of a sampling operation
-// and it is expected that a follow-up after_sample call is issued.
-bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler, long now_ns);
+// IMPORTANT: A call to this method MUST be followed by a call to either
+// `discrete_dynamic_sampler_before_sample` if return is `true` or
+// `discrete_dynamic_sampler_skipped_sample` if return is `false`.
+//
+// In the past, this method took care of before_sample/skipped_sample/readjust as well.
+// We've had to split it up because we wanted to both use coarse time and regular monotonic time depending on the
+// situation, but also wanted time to come as an argument from the outside.
+//
+// TL;DR here's how they should be used as Ruby code:
+// if discrete_dynamic_sampler_should_sample
+//   discrete_dynamic_sampler_before_sample(monotonic_wall_time_now_ns())
+// else
+//   needs_readjust = discrete_dynamic_sampler_skipped_sample(monotonic_coarse_wall_time_now_ns())
+//   discrete_dynamic_sampler_readjust(monotonic_wall_time_now_ns()) if needs_readjust
+// end
+__attribute__((warn_unused_result))
+bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler);
+
+// Signal the start of a sampling operation.
+// MUST be called after `discrete_dynamic_sampler_should_sample` returns `true`.
+void discrete_dynamic_sampler_before_sample(discrete_dynamic_sampler *sampler, long now_ns);
+
+// Signals that sampling did not happen.
+// MUST be called after `discrete_dynamic_sampler_should_sample` returns `false`.
+//
+// @return True if the sampler needs to be readjusted.
+//
+// IMPORTANT: A call to this method MUST be followed by a call to `discrete_dynamic_sampler_readjust` if return is `true`.
+__attribute__((warn_unused_result))
+bool discrete_dynamic_sampler_skipped_sample(discrete_dynamic_sampler *sampler, coarse_instant now);
+
+void discrete_dynamic_sampler_readjust(discrete_dynamic_sampler *sampler, long now_ns);
 
 // Signal the end of a sampling operation.
 //
diff --git a/ext/datadog_profiling_native_extension/time_helpers.h b/ext/datadog_profiling_native_extension/time_helpers.h
index 7ba73d5f63a..1475ed3aad8 100644
--- a/ext/datadog_profiling_native_extension/time_helpers.h
+++ b/ext/datadog_profiling_native_extension/time_helpers.h
@@ -37,17 +37,19 @@ inline long system_epoch_time_now_ns(raise_on_failure_setting raise_on_failure)
 
 // Coarse instants use CLOCK_MONOTONIC_COARSE on Linux which is expected to provide resolution in the millisecond range:
 // https://docs.redhat.com/en/documentation/red_hat_enterprise_linux_for_real_time/7/html/reference_guide/sect-posix_clocks#Using_clock_getres_to_compare_clock_resolution
-// We introduce here a separate type for it, so as to make it harder/more explicit when these timestamps are used
+// We introduce here a separate type for it, so as to make it harder to misuse/more explicit when these timestamps are used
 typedef struct coarse_instant {
   long timestamp_ns;
 } coarse_instant;
 
+inline coarse_instant to_coarse_instant(long timestamp_ns) { return (coarse_instant) {.timestamp_ns = timestamp_ns}; }
+
 inline coarse_instant monotonic_coarse_wall_time_now_ns(void) {
-  #ifdef HAVE_CLOCK_MONOTONIC_COARSE // Linux
-    return (coarse_instant) {.timestamp_ns = retrieve_clock_as_ns(CLOCK_MONOTONIC_COARSE, DO_NOT_RAISE_ON_FAILURE)};
+  #ifdef HAVE_CLOCK_MONOTONIC_COARSE // Linux
+    return to_coarse_instant(retrieve_clock_as_ns(CLOCK_MONOTONIC_COARSE, DO_NOT_RAISE_ON_FAILURE));
   #else // macOS
-    return (coarse_instant) {.timestamp_ns = retrieve_clock_as_ns(CLOCK_MONOTONIC, DO_NOT_RAISE_ON_FAILURE)};
+    return to_coarse_instant(retrieve_clock_as_ns(CLOCK_MONOTONIC, DO_NOT_RAISE_ON_FAILURE));
   #endif
 }