diff --git a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index 71ab9451e7a..362855c5af8 100644 --- a/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/ddtrace_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -936,7 +936,7 @@ static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) // TODO: This is a placeholder sampling decision strategy. We plan to replace it with a better one soon (e.g. before // beta), and having something here allows us to test the rest of feature, sampling decision aside. - if (((allocation_count % state->allocation_sample_every) == 0)) { + if (allocation_count % state->allocation_sample_every == 0) { // Rescue against any exceptions that happen during sampling safely_call(rescued_sample_allocation, tracepoint_data, state->self_instance); } diff --git a/ext/ddtrace_profiling_native_extension/heap_recorder.c b/ext/ddtrace_profiling_native_extension/heap_recorder.c index 4f8ca413015..8effc6b4971 100644 --- a/ext/ddtrace_profiling_native_extension/heap_recorder.c +++ b/ext/ddtrace_profiling_native_extension/heap_recorder.c @@ -22,6 +22,8 @@ struct heap_recorder { // ========================== // Heap Recorder External API // +// WARN: All these APIs should support receiving a NULL heap_recorder, resulting in a noop. +// // WARN: Except for ::heap_recorder_for_each_live_object, we always assume interaction with these APIs // happens under the GVL. // @@ -38,10 +40,26 @@ heap_recorder* heap_recorder_new(void) { } void heap_recorder_free(struct heap_recorder* recorder) { + if (recorder == NULL) { + return; + } + ruby_xfree(recorder); } -void start_heap_allocation_recording(heap_recorder* heap_recorder, VALUE new_obj, unsigned int weight) { +// TODO: Remove when things get implemented +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" + +void heap_recorder_after_fork(heap_recorder *heap_recorder) { + // TODO: Implement +} + +void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj, unsigned int weight) { + if (heap_recorder == NULL) { + return; + } + heap_recorder->active_recording = (partial_heap_recording) { .obj = new_obj, .object_data = (live_object_data) { @@ -50,11 +68,11 @@ void start_heap_allocation_recording(heap_recorder* heap_recorder, VALUE new_obj }; } -// TODO: Remove when things get implemented -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" - void end_heap_allocation_recording(struct heap_recorder *heap_recorder, ddog_prof_Slice_Location locations) { + if (heap_recorder == NULL) { + return; + } + partial_heap_recording *active_recording = &heap_recorder->active_recording; VALUE new_obj = active_recording->obj; @@ -72,18 +90,30 @@ void end_heap_allocation_recording(struct heap_recorder *heap_recorder, ddog_pro // WARN: This can get called during Ruby GC. NO HEAP ALLOCATIONS OR EXCEPTIONS ARE ALLOWED. 
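For context on the sampling decision above: the worker keeps a monotonically increasing allocation counter and takes a sample whenever the counter hits a multiple of allocation_sample_every. A standalone sketch of that placeholder strategy follows; the names and the printf stand-in for the real sampling call are illustrative only and not part of this patch.

#include <stdint.h>
#include <stdio.h>

// Stand-ins for the worker's global counter and its sampling routine.
static uint64_t allocation_count = 0;

static void sample_allocation(uint64_t seen_so_far) {
  printf("sampling allocation #%llu\n", (unsigned long long) seen_so_far);
}

// Called once per object allocation; samples every Nth allocation.
static void on_new_object(unsigned int sample_every) {
  allocation_count++;
  if (allocation_count % sample_every == 0) {
    sample_allocation(allocation_count);
  }
}

int main(void) {
  for (int i = 0; i < 200; i++) on_new_object(50); // expect 4 samples
  return 0;
}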
void record_heap_free(heap_recorder *heap_recorder, VALUE obj) { + if (heap_recorder == NULL) { + return; + } + // TODO: Implement } void heap_recorder_flush(heap_recorder *heap_recorder) { + if (heap_recorder == NULL) { + return; + } + // TODO: Implement } void heap_recorder_for_each_live_object( heap_recorder *heap_recorder, - void (*for_each_callback)(heap_recorder_iteration_data stack_data, void *extra_arg), + bool (*for_each_callback)(heap_recorder_iteration_data stack_data, void *extra_arg), void *for_each_callback_extra_arg, bool with_gvl) { + if (heap_recorder == NULL) { + return; + } + // TODO: Implement } diff --git a/ext/ddtrace_profiling_native_extension/heap_recorder.h b/ext/ddtrace_profiling_native_extension/heap_recorder.h index e189e206df2..e7a09a63477 100644 --- a/ext/ddtrace_profiling_native_extension/heap_recorder.h +++ b/ext/ddtrace_profiling_native_extension/heap_recorder.h @@ -9,6 +9,9 @@ // not yet observed are deemed as alive and can be iterated on to produce a // live heap profile. // +// NOTE: All public APIs of heap_recorder support receiving a NULL heap_recorder +// in which case the behaviour will be a noop. +// // WARN: Unless otherwise stated the heap recorder APIs assume calls are done // under the GVL. typedef struct heap_recorder heap_recorder; @@ -38,6 +41,9 @@ heap_recorder* heap_recorder_new(void); // Free a previously initialized heap recorder. void heap_recorder_free(heap_recorder *heap_recorder); +// Do any cleanup needed after forking. +void heap_recorder_after_fork(heap_recorder *heap_recorder); + // Start a heap allocation recording on the heap recorder for a new object. // // This heap allocation recording needs to be ended via ::end_heap_allocation_recording @@ -88,7 +94,8 @@ void heap_recorder_flush(heap_recorder *heap_recorder); // A callback function that shall be called for each live object being tracked // by the heap recorder. Alongside the iteration_data for each live object, // a second argument will be forwarded with the contents of the optional -// for_each_callback_extra_arg. +// for_each_callback_extra_arg. Iteration will continue until the callback +// returns false or we run out of objects. // @param for_each_callback_extra_arg // Optional (NULL if empty) extra data that should be passed to the // callback function alongside the data for each live tracked object. @@ -96,6 +103,6 @@ void heap_recorder_flush(heap_recorder *heap_recorder); // True if we're calling this while holding the GVL, false otherwise. void heap_recorder_for_each_live_object( heap_recorder *heap_recorder, - void (*for_each_callback)(heap_recorder_iteration_data data, void* extra_arg), + bool (*for_each_callback)(heap_recorder_iteration_data data, void* extra_arg), void *for_each_callback_extra_arg, bool with_gvl); diff --git a/ext/ddtrace_profiling_native_extension/libdatadog_helpers.c b/ext/ddtrace_profiling_native_extension/libdatadog_helpers.c index 07f2236decc..f6f8b69b1b9 100644 --- a/ext/ddtrace_profiling_native_extension/libdatadog_helpers.c +++ b/ext/ddtrace_profiling_native_extension/libdatadog_helpers.c @@ -41,29 +41,22 @@ ddog_CharSlice ruby_value_type_to_char_slice(enum ruby_value_type type) { } } -#define MAX_DDOG_ERR_MESSAGE_SIZE 128 -#define DDOG_ERR_MESSAGE_TRUNCATION_SUFFIX "..." 
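The signature change above makes the iteration callback cancellable: the walk stops as soon as the callback returns false. A minimal sketch of that pattern over a plain array (the item type and callback here are illustrative and unrelated to the heap recorder's internals):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef struct { int value; } item;

// Walks items until the callback returns false or we run out of items.
static void for_each_item(item *items, size_t count, bool (*callback)(item *it, void *extra_arg), void *extra_arg) {
  for (size_t i = 0; i < count; i++) {
    if (!callback(&items[i], extra_arg)) return; // callback cancelled the iteration
  }
}

// Example callback: sums values but bails out at the first negative entry.
static bool sum_until_negative(item *it, void *extra_arg) {
  if (it->value < 0) return false;
  *((int *) extra_arg) += it->value;
  return true;
}

int main(void) {
  item items[] = { {1}, {2}, {-1}, {4} };
  int sum = 0;
  for_each_item(items, 4, sum_until_negative, &sum);
  printf("sum before cancellation: %d\n", sum); // prints 3
  return 0;
}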
+size_t read_ddogerr_string_and_drop(ddog_Error *error, char *string, size_t capacity) { + if (capacity == 0 || string == NULL) { + // short-circuit, we can't write anything + ddog_Error_drop(error); + return 0; + } -void grab_gvl_and_raise_ddogerr_and_drop(const char *while_context, ddog_Error *error) { - char error_msg[MAX_DDOG_ERR_MESSAGE_SIZE]; ddog_CharSlice error_msg_slice = ddog_Error_message(error); - size_t copy_size = error_msg_slice.len; + size_t error_msg_size = error_msg_slice.len; // Account for extra null char for proper cstring - size_t error_msg_size = copy_size + 1; - bool truncated = false; - if (error_msg_size > MAX_DDOG_ERR_MESSAGE_SIZE) { - // if it seems like the error msg won't fit, lets truncate things by limiting - // the copy_size to MAX_SIZE minus the size of the truncation suffix - // (which already includes space for a terminating '\0' due to how sizeof works) - copy_size = MAX_DDOG_ERR_MESSAGE_SIZE - sizeof(DDOG_ERR_MESSAGE_TRUNCATION_SUFFIX); - error_msg_size = MAX_DDOG_ERR_MESSAGE_SIZE; - truncated = true; - } - strncpy(error_msg, error_msg_slice.ptr, copy_size); - if (truncated) { - strcpy(error_msg + copy_size, DDOG_ERR_MESSAGE_TRUNCATION_SUFFIX); + if (error_msg_size >= capacity) { + // Error message too big, lets truncate it to capacity - 1 to allow for extra null at end + error_msg_size = capacity - 1; } - error_msg[error_msg_size - 1] = '\0'; + strncpy(string, error_msg_slice.ptr, error_msg_size); + string[error_msg_size] = '\0'; ddog_Error_drop(error); - grab_gvl_and_raise(rb_eRuntimeError, "Libdatadog error while (%s): %s", while_context, error_msg); + return error_msg_size; } diff --git a/ext/ddtrace_profiling_native_extension/libdatadog_helpers.h b/ext/ddtrace_profiling_native_extension/libdatadog_helpers.h index 9a1734327a8..7d4593fa550 100644 --- a/ext/ddtrace_profiling_native_extension/libdatadog_helpers.h +++ b/ext/ddtrace_profiling_native_extension/libdatadog_helpers.h @@ -24,11 +24,10 @@ inline static VALUE get_error_details_and_drop(ddog_Error *error) { return result; } -// Utility function to be able to easily raise a ddog_Error outside the GVL. -// The message will look like 'Libdatadog error while : ' -NORETURN( - void grab_gvl_and_raise_ddogerr_and_drop(const char *while_context, ddog_Error *error); -); +// Utility function to be able to extract an error cstring from a ddog_Error. +// Returns the amount of characters written to string (which are necessarily +// bounded by capacity - 1 since the string will be null-terminated). +size_t read_ddogerr_string_and_drop(ddog_Error *error, char *string, size_t capacity); // Used for pretty printing this Ruby enum. Returns "T_UNKNOWN_OR_MISSING_RUBY_VALUE_TYPE_ENTRY" for unknown elements.
// In practice, there's a few types that the profiler will probably never encounter, but I've added all entries of diff --git a/ext/ddtrace_profiling_native_extension/stack_recorder.c b/ext/ddtrace_profiling_native_extension/stack_recorder.c index 0b431283b93..05ea9076435 100644 --- a/ext/ddtrace_profiling_native_extension/stack_recorder.c +++ b/ext/ddtrace_profiling_native_extension/stack_recorder.c @@ -221,7 +221,7 @@ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE recorder_ static void serializer_set_start_timestamp_for_next_profile(struct stack_recorder_state *state, ddog_Timespec start_time); static VALUE _native_record_endpoint(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance, VALUE local_root_span_id, VALUE endpoint); static void reset_profile(ddog_prof_Profile *profile, ddog_Timespec *start_time /* Can be null */); -static VALUE _native_track_obj_allocation(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance, VALUE new_obj, VALUE weight); +static VALUE _native_track_object(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance, VALUE new_obj, VALUE weight); static VALUE _native_record_obj_free(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance, VALUE obj); @@ -247,7 +247,7 @@ void stack_recorder_init(VALUE profiling_module) { rb_define_singleton_method(testing_module, "_native_slot_one_mutex_locked?", _native_is_slot_one_mutex_locked, 1); rb_define_singleton_method(testing_module, "_native_slot_two_mutex_locked?", _native_is_slot_two_mutex_locked, 1); rb_define_singleton_method(testing_module, "_native_record_endpoint", _native_record_endpoint, 3); - rb_define_singleton_method(testing_module, "_native_track_obj_allocation", _native_track_obj_allocation, 3); + rb_define_singleton_method(testing_module, "_native_track_object", _native_track_object, 3); rb_define_singleton_method(testing_module, "_native_record_obj_free", _native_record_obj_free, 2); ok_symbol = ID2SYM(rb_intern_const("ok")); @@ -283,6 +283,10 @@ static VALUE _native_new(VALUE klass) { VALUE stack_recorder = TypedData_Wrap_Struct(klass, &stack_recorder_typed_data, state); + // NOTE: We initialize this because we want a new recorder to be operational even without initialization and our + // default is everything enabled. However, if during recording initialization it turns out we don't want + // heap samples, we will free and reset heap_recorder to NULL, effectively disabling all behaviour specific + // to heap profiling (all calls to heap_recorder_* with a NULL heap recorder are noops). state->heap_recorder = heap_recorder_new(); // Note: Don't raise exceptions after this point, since it'll lead to libdatadog memory leaking! @@ -415,15 +419,9 @@ static VALUE _native_serialize(DDTRACE_UNUSED VALUE _self, VALUE recorder_instan // Need to do this while still holding on to the Global VM Lock; see comments on method for why serializer_set_start_timestamp_for_next_profile(state, finish_timestamp); - if (state->heap_recorder != NULL) { - // Flush any pending data in the heap recorder prior to doing the iteration during serialization. - // This needs to happen while holding on to the GVL - // TODO: Can this deadlock with the sampler thread due to GVL or Ruby specific things? - // 1. Sampler is in the middle of recording a heap allocation. - // 2. Recorder gets scheduled and tries to acquire the lock, waiting if needed. - // 3. Are we guaranteed that the sampler can make progress? 
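The NOTE in _native_new above relies on a simple convention: the recorder starts with heap tracking enabled, and turning the feature off later amounts to freeing the recorder and keeping a NULL handle, since every heap_recorder_* entry point treats NULL as a no-op. That is also why the NULL checks at the call sites can be dropped. A compilable sketch of the convention with a toy type (not the real heap_recorder):

#include <stdlib.h>

typedef struct { long live_objects; } toy_recorder;

static toy_recorder *toy_recorder_new(void) { return calloc(1, sizeof(toy_recorder)); }

// Every API accepts NULL and becomes a no-op, so callers never need to check
// whether the feature is enabled.
static void toy_recorder_track(toy_recorder *recorder) {
  if (recorder == NULL) return;
  recorder->live_objects++;
}

static void toy_recorder_free(toy_recorder *recorder) {
  if (recorder == NULL) return;
  free(recorder);
}

int main(void) {
  toy_recorder *recorder = toy_recorder_new(); // enabled by default
  toy_recorder_track(recorder);

  // "Disabling" the feature: free the recorder and keep a NULL handle around;
  // every later call quietly does nothing.
  toy_recorder_free(recorder);
  recorder = NULL;
  toy_recorder_track(recorder);
  toy_recorder_free(recorder);
  return 0;
}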
- heap_recorder_flush(state->heap_recorder); - } + // Flush any pending data in the heap recorder prior to doing the iteration during serialization. + // This needs to happen while holding on to the GVL + heap_recorder_flush(state->heap_recorder); // We'll release the Global VM Lock while we're calling serialize, so that the Ruby VM can continue to work while this // is pending @@ -486,9 +484,8 @@ void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, metric_values[position_for[WALL_TIME_VALUE_ID]] = values.wall_time_ns; metric_values[position_for[ALLOC_SAMPLES_VALUE_ID]] = values.alloc_samples; - if (values.alloc_samples != 0 && state->heap_recorder != NULL) { - // If we got an allocation sample and our heap recorder is initialized, - // end the heap allocation recording to commit the heap sample. + if (values.alloc_samples != 0) { + // If we got an allocation sample end the heap allocation recording to commit the heap sample. // FIXME: Heap sampling currently has to be done in 2 parts because the construction of locations is happening // very late in the allocation-sampling path (which is shared with the cpu sampling path). This can // be fixed with some refactoring but for now this leads to a less impactful change. @@ -515,10 +512,6 @@ void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, void track_object(VALUE recorder_instance, VALUE new_object, unsigned int sample_weight) { struct stack_recorder_state *state; TypedData_Get_Struct(recorder_instance, struct stack_recorder_state, &stack_recorder_typed_data, state); - if (state->heap_recorder == NULL) { - // we aren't gathering heap data so we can ignore this - return; - } // FIXME: Heap sampling currently has to be done in 2 parts because the construction of locations is happening // very late in the allocation-sampling path (which is shared with the cpu sampling path). This can // be fixed with some refactoring but for now this leads to a less impactful change. @@ -529,10 +522,6 @@ void track_object(VALUE recorder_instance, VALUE new_object, unsigned int sample void record_obj_free(VALUE recorder_instance, VALUE freed_object) { struct stack_recorder_state *state; TypedData_Get_Struct(recorder_instance, struct stack_recorder_state, &stack_recorder_typed_data, state); - if (state->heap_recorder == NULL) { - // we aren't gathering heap data so we can ignore this - return; - } record_heap_free(state->heap_recorder, freed_object); } @@ -551,14 +540,18 @@ void record_endpoint(VALUE recorder_instance, uint64_t local_root_span_id, ddog_ } } +#define MAX_LEN_HEAP_ITERATION_ERROR_MSG 256 + // Heap recorder iteration context allows us access to stack recorder state and profile being serialized // during iteration of heap recorder live objects. 
typedef struct heap_recorder_iteration_context { struct stack_recorder_state *state; ddog_prof_Profile *profile; + bool error; + char error_msg[MAX_LEN_HEAP_ITERATION_ERROR_MSG]; } heap_recorder_iteration_context; -static void add_heap_sample_to_active_profile_without_gvl(heap_recorder_iteration_data iteration_data, void *extra_arg) { +static bool add_heap_sample_to_active_profile_without_gvl(heap_recorder_iteration_data iteration_data, void *extra_arg) { heap_recorder_iteration_context *context = (heap_recorder_iteration_context*) extra_arg; int64_t metric_values[ALL_VALUE_TYPES_COUNT] = {0}; @@ -577,16 +570,32 @@ static void add_heap_sample_to_active_profile_without_gvl(heap_recorder_iteratio ); if (result.tag == DDOG_PROF_PROFILE_RESULT_ERR) { - grab_gvl_and_raise_ddogerr_and_drop("recording heap sample", &result.err); + read_ddogerr_string_and_drop(&result.err, context->error_msg, MAX_LEN_HEAP_ITERATION_ERROR_MSG); + context->error = true; + // By returning false we cancel the iteration + return false; } + + // Keep on iterating to next item! + return true; } static void build_heap_profile_without_gvl(struct stack_recorder_state *state, ddog_prof_Profile *profile) { heap_recorder_iteration_context iteration_context = { .state = state, - .profile = profile + .profile = profile, + .error = false, + .error_msg = {0}, }; heap_recorder_for_each_live_object(state->heap_recorder, add_heap_sample_to_active_profile_without_gvl, (void*) &iteration_context, false); + if (iteration_context.error) { + // We wait until we're out of the iteration to grab the gvl and raise. This is important because during + // iteration we first grab the records_mutex and raising requires grabbing the GVL. When sampling, we are + // in the opposite situation: we have the GVL and may need to grab the records_mutex for mutation. This + // different ordering can lead to deadlocks. By delaying the raise here until after we no longer hold + // records_mutex, we prevent this different-lock-acquisition-order situation. + grab_gvl_and_raise(rb_eRuntimeError, "Failure during heap profile building: %s", iteration_context.error_msg); + } } static void *call_serialize_without_gvl(void *call_args) { @@ -594,11 +603,9 @@ static void *call_serialize_without_gvl(void *call_args) { args->profile = serializer_flip_active_and_inactive_slots(args->state); - if (args->state->heap_recorder != NULL) { - // Now that we have the inactive profile with all but heap samples, lets fill it with heap data - // without needing to race with the active sampler - build_heap_profile_without_gvl(args->state, args->profile); - } + // Now that we have the inactive profile with all but heap samples, lets fill it with heap data + // without needing to race with the active sampler + build_heap_profile_without_gvl(args->state, args->profile); // Note: The profile gets reset by the serialize call args->result = ddog_prof_Profile_serialize(args->profile, &args->finish_timestamp, NULL /* duration_nanos is optional */, NULL /* start_time is optional */); @@ -721,8 +728,7 @@ static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE recorder_ reset_profile(&state->slot_one_profile, /* start_time: */ NULL); reset_profile(&state->slot_two_profile, /* start_time: */ NULL); - // After a fork, all tracked live objects should still be alive in the child process so we purposefully DON'T - // reset the heap recorder. The live heap it is tracking is still alive after all.
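The comment above is about lock ordering: the iteration runs without the GVL while holding the recorder's mutex, so raising from inside the callback (which needs the GVL) could deadlock against a sampler that already holds the GVL and wants the mutex. Instead, the failure is stashed in the iteration context and only surfaced after the mutex is released. A standalone sketch of that shape, using a pthread mutex and fprintf in place of grab_gvl_and_raise (all names here are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_ERROR_MSG 256

typedef struct {
  bool error;
  char error_msg[MAX_ERROR_MSG];
} iteration_context;

static pthread_mutex_t records_mutex = PTHREAD_MUTEX_INITIALIZER;

// Runs while records_mutex is held; it must not report the failure directly,
// so it records it in the context and cancels the iteration.
static bool process_record(int record, iteration_context *context) {
  if (record < 0) {
    snprintf(context->error_msg, MAX_ERROR_MSG, "bad record: %d", record);
    context->error = true;
    return false;
  }
  return true;
}

static void iterate_records(const int *records, int count, iteration_context *context) {
  pthread_mutex_lock(&records_mutex);
  for (int i = 0; i < count; i++) {
    if (!process_record(records[i], context)) break;
  }
  pthread_mutex_unlock(&records_mutex);
}

int main(void) {
  int records[] = { 1, 2, -7, 4 };
  iteration_context context = { .error = false, .error_msg = {0} };

  iterate_records(records, 4, &context);

  // Only report the failure once the mutex is no longer held, mirroring how the
  // serializer delays grab_gvl_and_raise until after the iteration finishes.
  if (context.error) fprintf(stderr, "iteration failed: %s\n", context.error_msg);
  return 0;
}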
+ heap_recorder_after_fork(state->heap_recorder); return Qtrue; } @@ -741,7 +747,7 @@ static VALUE _native_record_endpoint(DDTRACE_UNUSED VALUE _self, VALUE recorder_ return Qtrue; } -static VALUE _native_track_obj_allocation(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance, VALUE new_obj, VALUE weight) { +static VALUE _native_track_object(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance, VALUE new_obj, VALUE weight) { ENFORCE_TYPE(weight, T_FIXNUM); track_object(recorder_instance, new_obj, NUM2UINT(weight)); return Qtrue; diff --git a/lib/datadog/core/configuration/settings.rb b/lib/datadog/core/configuration/settings.rb index f7a3baefa38..7050089d0a3 100644 --- a/lib/datadog/core/configuration/settings.rb +++ b/lib/datadog/core/configuration/settings.rb @@ -333,7 +333,7 @@ def initialize(*_) # in summary, this should be supported on Ruby 2.x, 3.1.4+, 3.2.3+ and 3.3.0+. Enabling it on # unsupported Rubies may result in unexpected behaviour, including crashes. # - # @note Heap profiles are not yet GA in the Datadog UI, get in touch if you want to help us test it. + # @note Allocation profiles are not yet GA in the Datadog UI, get in touch if you want to help us test it. # # @default `DD_PROFILING_EXPERIMENTAL_ALLOCATION_ENABLED` environment variable as a boolean, otherwise `false` option :experimental_allocation_enabled do |o| @@ -361,8 +361,6 @@ def initialize(*_) # The lower the value, the more accuracy in allocation and heap tracking but the bigger the overhead. In # particular, a value of 1 will sample ALL allocations. # - # This feature is not supported in all Rubies. Refer to {Datadog::Profiling::Ext::IS_ALLOC_SAMPLING_SUPPORTED} - # # @default `DD_PROFILING_EXPERIMENTAL_ALLOCATION_SAMPLE_RATE` environment variable, otherwise `50`. option :experimental_allocation_sample_rate do |o| o.type :int diff --git a/lib/datadog/profiling/component.rb b/lib/datadog/profiling/component.rb index 76ccc4a65e3..f986e76a2e0 100644 --- a/lib/datadog/profiling/component.rb +++ b/lib/datadog/profiling/component.rb @@ -147,7 +147,12 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) # https://github.com/ruby/ruby/pull/7464) that makes this crash in any configuration. This bug is # fixed on Ruby versions 3.2.3 and 3.3.0. if RUBY_VERSION.start_with?('3.2.') && RUBY_VERSION < '3.2.3' - raise "Sorry, allocation profiling is not supported in #{RUBY_VERSION}. Please use 3.1.x, 3.2.3 or 3.3.0+" + Datadog.logger.warn( + 'Allocation profiling is not supported in Ruby versions 3.2.0, 3.2.1 and 3.2.2 and will be forcibly '\ + 'disabled. This is due to a VM bug that can lead to crashes (https://bugs.ruby-lang.org/issues/19482). '\ + 'Other Ruby versions do not suffer from this issue.' + ) + return false end # SEVERE - Only with Ractors @@ -160,16 +165,17 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) (RUBY_VERSION.start_with?('3.2.') && RUBY_VERSION < '3.2.3') Datadog.logger.warn( "Current Ruby version (#{RUBY_VERSION}) has a VM bug where enabling allocation profiling while using "\ - 'Ractors may cause unexpected issues, including crashes. This does not happen if Ractors are not used.' + 'Ractors may cause unexpected issues, including crashes (https://bugs.ruby-lang.org/issues/18464). '\ + 'This does not happen if Ractors are not used.' 
) # ANNOYANCE - Only with Ractors # On all known versions of Ruby 3.x, due to https://bugs.ruby-lang.org/issues/19112, when a ractor gets # garbage collected, Ruby will disable all active tracepoints, which this feature internally relies on. elsif RUBY_VERSION.start_with?('3.') Datadog.logger.warn( - 'In all known versions of Ruby 3.x, using Ractors may result in allocation profiling unexpectedly stopping. '\ - 'Note that this stop has no impact in your application stability or performance. This does not happen if ' \ - 'Ractors are not used.' + 'In all known versions of Ruby 3.x, using Ractors may result in allocation profiling unexpectedly ' \ + 'stopping (https://bugs.ruby-lang.org/issues/19112). Note that this stop has no impact in your ' \ + 'application stability or performance. This does not happen if Ractors are not used.' ) end diff --git a/spec/datadog/core/configuration/settings_spec.rb b/spec/datadog/core/configuration/settings_spec.rb index 45e542670d2..1a76f5b3393 100644 --- a/spec/datadog/core/configuration/settings_spec.rb +++ b/spec/datadog/core/configuration/settings_spec.rb @@ -497,6 +497,14 @@ end end + describe '#allocation_counting_enabled=' do + it 'logs a warning informing customers this no longer does anything' do + expect(Datadog.logger).to receive(:warn).with(/no longer does anything/) + + settings.profiling.advanced.allocation_counting_enabled = false + end + end + describe '#experimental_allocation_enabled' do subject(:experimental_allocation_enabled) { settings.profiling.advanced.experimental_allocation_enabled } diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index a187497fa96..0f2c8c42cb9 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -12,7 +12,7 @@ let(:allocation_sample_every) { 50 } let(:allocation_profiling_enabled) { false } let(:heap_profiling_enabled) { false } - let(:recorder) { build_stack_recorder(true) } + let(:recorder) { build_stack_recorder(heap_samples_enabled: true) } let(:no_signals_workaround_enabled) { false } let(:timeline_enabled) { false } let(:options) { {} } @@ -67,7 +67,10 @@ wait_until_running end - @skip_cleanup = false + before do + @skip_cleanup = false + end + after do cpu_and_wall_time_worker.stop unless @skip_cleanup end @@ -524,11 +527,10 @@ cpu_and_wall_time_worker.stop - total_heap_sampled_for_test_struct = samples_for_thread(samples_from_pprof(recorder.serialize!), Thread.current) - .select { |s| s.locations.first.lineno == allocation_line && s.locations.first.path == __FILE__ } - .sum { |s| s.values[:'heap-live-samples'] } + relevant_sample = samples_for_thread(samples_from_pprof(recorder.serialize!), Thread.current) + .find { |s| s.locations.first.lineno == allocation_line && s.locations.first.path == __FILE__ } - expect(total_heap_sampled_for_test_struct).to eq test_num_allocated_object + expect(relevant_sample.values[:'heap-live-samples']).to eq test_num_allocated_object end context 'but allocation profiling is disabled' do @@ -807,17 +809,21 @@ it { is_expected.to be nil } end - context 'when allocation profiling is enabled' do - let(:allocation_profiling_enabled) { true } + context 'when CpuAndWallTimeWorker has been started' do + before do + cpu_and_wall_time_worker.start -
wait_until_running - end + after do + cpu_and_wall_time_worker.stop + end + + context 'when allocation profiling is enabled' do + let(:allocation_profiling_enabled) { true } - after do - cpu_and_wall_time_worker.stop + it 'always returns a >= 0 value' do + expect(described_class._native_allocation_count).to be >= 0 end it 'returns the number of allocations between two calls of the method' do @@ -870,19 +876,10 @@ context 'when allocation profiling is disabled' do let(:allocation_profiling_enabled) { false } + it 'always returns a nil value' do + 100.times { Object.new } - it 'still returns nil after a bunch of allocations' do - # To get the exact expected number of allocations, we run this once before so that Ruby can create and cache all - # it needs to - new_object = proc { Object.new } - 1.times(&new_object) - - before_allocations = described_class._native_allocation_count - 100.times(&new_object) - after_allocations = described_class._native_allocation_count - - expect(before_allocations).to be nil - expect(after_allocations).to be nil + expect(described_class._native_allocation_count).to be nil end end end diff --git a/spec/datadog/profiling/component_spec.rb b/spec/datadog/profiling/component_spec.rb index df0293e7366..42cef9f03b6 100644 --- a/spec/datadog/profiling/component_spec.rb +++ b/spec/datadog/profiling/component_spec.rb @@ -145,8 +145,16 @@ context "on a Ruby 3 version affected by https://bugs.ruby-lang.org/issues/19482 (#{broken_ruby})" do let(:testing_version) { broken_ruby } - it 'raises an error about lack of support during initialization of a CpuAndWallTimeWorker' do - expect { build_profiler_component }.to raise_error(/not supported.+Please use/) + it 'initializes a CpuAndWallTimeWorker with allocation_profiling forcibly set to false and warns' do + expect(Datadog::Profiling::Collectors::CpuAndWallTimeWorker).to receive(:new).with hash_including( + allocation_profiling_enabled: false, + ) + + expect(Datadog.logger).to receive(:warn).with(/forcibly disabled/) + expect(Datadog.logger).to_not receive(:warn).with(/Ractor/) + expect(Datadog.logger).to_not receive(:warn).with(/experimental allocation profiling/) + + build_profiler_component end end end diff --git a/spec/datadog/profiling/spec_helper.rb b/spec/datadog/profiling/spec_helper.rb index 75f17183e9e..e02c1af213b 100644 --- a/spec/datadog/profiling/spec_helper.rb +++ b/spec/datadog/profiling/spec_helper.rb @@ -74,7 +74,7 @@ def samples_for_thread(samples, thread) # We disable heap_sample collection by default in tests since it requires some extra mocking/ # setup for it to properly work. 
- def build_stack_recorder(heap_samples_enabled = false) + def build_stack_recorder(heap_samples_enabled: false) Datadog::Profiling::StackRecorder.new( cpu_time_enabled: true, alloc_samples_enabled: true, diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index e6b8c8dd7d1..28ac1c6bd27 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -337,15 +337,15 @@ def sample_types_from(decoded_profile) end describe 'heap samples' do - let(:metric_values1) { { 'cpu-time' => 101, 'cpu-samples' => 1, 'wall-time' => 789, 'alloc-samples' => 10 } } - let(:metric_values2) { { 'cpu-time' => 102, 'cpu-samples' => 2, 'wall-time' => 790, 'alloc-samples' => 11 } } - let(:labels) { { 'label_a' => 'value_a', 'label_b' => 'value_b', 'state' => 'unknown' }.to_a } let(:sample_rate) { 50 } + let(:metric_values1) { { 'cpu-time' => 101, 'cpu-samples' => 1, 'wall-time' => 789, 'alloc-samples' => sample_rate } } + let(:metric_values2) { { 'cpu-time' => 102, 'cpu-samples' => 2, 'wall-time' => 790, 'alloc-samples' => sample_rate } } + let(:labels) { { 'label_a' => 'value_a', 'label_b' => 'value_b', 'state' => 'unknown' }.to_a } let(:a_string) { 'a beautiful string' } let(:another_string) { 'a fearsome string' } - let(:an_array) { [1..10] } - let(:another_array) { [-10..-1] } + let(:an_array) { (1..10).to_a } + let(:another_array) { (-10..-1).to_a } let(:a_hash) { { 'a' => 1, 'b' => '2', 'c' => true } } let(:another_hash) { { 'z' => -1, 'y' => '-2', 'x' => false } } @@ -362,7 +362,7 @@ def sample_types_from(decoded_profile) before do allocated_objects.each_with_index do |obj, i| # Heap sampling currently requires this 2-step process to first pass data about the allocated object... - described_class::Testing._native_track_obj_allocation(stack_recorder, obj, sample_rate) + described_class::Testing._native_track_object(stack_recorder, obj, sample_rate) # ...and then pass data about the allocation stacktrace (with 2 distinct stacktraces) if i.even? Datadog::Profiling::Collectors::Stack::Testing @@ -391,16 +391,31 @@ def sample_types_from(decoded_profile) context 'when enabled' do let(:heap_samples_enabled) { true } + let(:heap_samples) do + samples.select { |s| s.values[:'heap-live-samples'] > 0 } + end + + let(:non_heap_samples) do + samples.select { |s| s.values[:'heap-live-samples'] == 0 } + end + it 'are included in the profile' do pending 'heap_recorder implementation is currently missing' - # We sample from 2 distinct locations but heap samples don't have the same - # labels as the others so they get duped. - expect(samples.size).to eq(4) - expect(samples.sum { |s| s.values[:'heap-live-samples'] || 0 }).to eq( + # We sample from 2 distinct locations + expect(heap_samples.size).to eq(2) + + expect(heap_samples.sum { |s| s.values[:'heap-live-samples'] || 0 }).to eq( [a_string, an_array, a_hash].size * sample_rate ) end + + it 'do not corrupt/overwrite non-heap-samples' do + expect(non_heap_samples.size).to eq(2) + + expect(non_heap_samples.sum { |s| s.values[:'cpu-time'] }).to be > 0 + expect(non_heap_samples.sum { |s| s.values[:'alloc-samples'] }).to eq(allocated_objects.size * sample_rate) + end end end