diff --git a/frontend/__snapshots__/scenes-app-pipeline--pipeline-node-new-hog-function--light.png b/frontend/__snapshots__/scenes-app-pipeline--pipeline-node-new-hog-function--light.png
index 1eaddf28cc8a7..770be28c1a0e5 100644
Binary files a/frontend/__snapshots__/scenes-app-pipeline--pipeline-node-new-hog-function--light.png and b/frontend/__snapshots__/scenes-app-pipeline--pipeline-node-new-hog-function--light.png differ
diff --git a/frontend/src/scenes/experiments/experimentLogic.tsx b/frontend/src/scenes/experiments/experimentLogic.tsx
index 9cca9e3bcc19a..35eebfb5904ea 100644
--- a/frontend/src/scenes/experiments/experimentLogic.tsx
+++ b/frontend/src/scenes/experiments/experimentLogic.tsx
@@ -1306,8 +1306,15 @@ export const experimentLogic = kea([
             },
         ],
         significanceDetails: [
-            (s) => [s.metricResults],
-            (metricResults: (CachedExperimentFunnelsQueryResponse | CachedExperimentTrendsQueryResponse | null)[]) =>
+            (s) => [s.metricResults, s.experimentStatsVersion],
+            (
+                metricResults: (
+                    | CachedExperimentFunnelsQueryResponse
+                    | CachedExperimentTrendsQueryResponse
+                    | null
+                )[],
+                experimentStatsVersion: number
+            ) =>
                 (metricIndex: number = 0): string => {
                     const results = metricResults?.[metricIndex]
@@ -1324,6 +1331,9 @@ export const experimentLogic = kea([
                     }
 
                     if (results?.significance_code === ExperimentSignificanceCode.LowWinProbability) {
+                        if (experimentStatsVersion === 2) {
+                            return 'This is because no variant (control or test) has a win probability higher than 90%.'
+                        }
                         return 'This is because the win probability of all test variants combined is less than 90%.'
                     }
diff --git a/posthog/hogql_queries/experiments/funnels_statistics_v2.py b/posthog/hogql_queries/experiments/funnels_statistics_v2.py
index 73d23f9924a08..a61c572d8d393 100644
--- a/posthog/hogql_queries/experiments/funnels_statistics_v2.py
+++ b/posthog/hogql_queries/experiments/funnels_statistics_v2.py
@@ -37,9 +37,9 @@ def calculate_probabilities_v2(
     Returns:
     --------
     list[float]
-        A list of probabilities that sum to 1, where:
-        - The first element is the probability that the control variant is the best
-        - Subsequent elements are the probabilities that each test variant is the best
+        A list of probabilities where each element represents:
+        - index 0: probability control beats the best test variant
+        - index i>0: probability test variant i-1 beats control
 
     Notes:
     ------
@@ -70,10 +70,20 @@ def calculate_probabilities_v2(
         samples.append(variant_samples)
 
     samples_array = np.array(samples)
-    # Calculate probability of each variant being the best
     probabilities = []
-    for i in range(len(all_variants)):
-        probability = (samples_array[i] == np.max(samples_array, axis=0)).mean()
+    control_samples = samples_array[0]  # Control is always the first variant
+
+    # Find the best test variant at each sample point
+    test_variants_samples = samples_array[1:]
+    best_variant_samples = np.max(test_variants_samples, axis=0)
+
+    # Probability that control beats the best test variant
+    control_prob = np.mean(control_samples >= best_variant_samples)
+    probabilities.append(float(control_prob))
+
+    # For each test variant, calculate the probability of beating control
+    for i in range(1, len(all_variants)):
+        probability = np.mean(samples_array[i] > control_samples)
        probabilities.append(float(probability))
 
     return probabilities
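To make the new return-value semantics concrete, here is a standalone sketch (not part of the diff) of how per-variant win probabilities against control can be estimated for conversion data with Beta posteriors. The `Beta(1, 1)` prior, draw count, and helper name are illustrative assumptions, not necessarily what `calculate_probabilities_v2` uses internally.

```python
# Illustrative sketch only: Beta-Bernoulli posterior sampling for conversion rates.
import numpy as np


def win_probabilities_vs_control(control, tests, n_samples=10_000, seed=0):
    """control/tests are (success_count, failure_count) tuples."""
    rng = np.random.default_rng(seed)
    # Posterior over each variant's conversion rate, assuming a Beta(1, 1) prior.
    control_samples = rng.beta(control[0] + 1, control[1] + 1, n_samples)
    test_samples = np.array([rng.beta(s + 1, f + 1, n_samples) for s, f in tests])

    probabilities = []
    # index 0: probability control beats the best test variant at each draw
    best_test = test_samples.max(axis=0)
    probabilities.append(float(np.mean(control_samples >= best_test)))
    # index i>0: probability test variant i-1 beats control
    for samples in test_samples:
        probabilities.append(float(np.mean(samples > control_samples)))
    return probabilities


# Mirrors the funnel test fixture below (10%, 8%, 15%, 11% conversion).
print(win_probabilities_vs_control((100, 900), [(80, 920), (150, 850), (110, 890)]))
```

Unlike the previous "probability of being the best variant" formulation, these values are pairwise comparisons against control and are not expected to sum to 1.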
diff --git a/posthog/hogql_queries/experiments/test/test_funnels_statistics.py b/posthog/hogql_queries/experiments/test/test_funnels_statistics.py
index bd70abf456162..86c16818aebee 100644
--- a/posthog/hogql_queries/experiments/test/test_funnels_statistics.py
+++ b/posthog/hogql_queries/experiments/test/test_funnels_statistics.py
@@ -173,6 +173,35 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
 
         self.run_test_for_both_implementations(run_test)
 
+    @flaky(max_runs=5, min_passes=1)
+    def test_many_variants_win_probability_compared_to_control(self):
+        """Test with multiple variants, win probability compared to control"""
+
+        def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
+            # test_a is worse than control
+            # test_b is best overall
+            # test_c is slightly better than control
+            control = create_variant("control", success_count=100, failure_count=900)  # 10% conversion
+            test_a = create_variant("test_a", success_count=80, failure_count=920)  # 8% conversion
+            test_b = create_variant("test_b", success_count=150, failure_count=850)  # 15% conversion
+            test_c = create_variant("test_c", success_count=110, failure_count=890)  # 11% conversion
+
+            probabilities = calculate_probabilities(control, [test_a, test_b, test_c])
+
+            self.assertEqual(len(probabilities), 4)
+            if stats_version == 2:
+                self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
+                self.assertAlmostEqual(probabilities[1], 0.05, delta=0.05)
+                self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
+                self.assertAlmostEqual(probabilities[3], 0.76, delta=0.05)
+            else:
+                self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
+                self.assertAlmostEqual(probabilities[1], 0, delta=0.05)
+                self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
+                self.assertAlmostEqual(probabilities[3], 0.0, delta=0.05)
+
+        self.run_test_for_both_implementations(run_test)
+
     @flaky(max_runs=5, min_passes=1)
     def test_insufficient_sample_size(self):
         """Test with sample size below threshold"""
diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py
index b3da850d35c57..6abd018542a3f 100644
--- a/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py
+++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_continuous.py
@@ -343,9 +343,11 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
 
         self.assertEqual(len(probabilities), 4)
         if stats_version == 2:
-            self.assertTrue(probabilities[2] > 0.9)
-            self.assertTrue(probabilities[1] < 0.1)
             self.assertTrue(probabilities[0] < 0.1)
+            self.assertTrue(probabilities[1] > 0.9)
+            self.assertTrue(probabilities[2] > 0.9)
+            self.assertTrue(probabilities[3] > 0.9)
+
             self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
             self.assertEqual(p_value, 0)
 
@@ -389,6 +391,55 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
 
         self.run_test_for_both_implementations(run_test)
 
+    @flaky(max_runs=5, min_passes=1)
+    def test_many_variants_win_probability_compared_to_control(self):
+        """Test with multiple variants, win probability compared to control"""
+
+        def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
+            control_absolute_exposure = 1000
+            control = create_variant(
+                "control",
+                total_sum=100.0 * control_absolute_exposure,
+                exposure=1,
+                absolute_exposure=control_absolute_exposure,
+            )
+            test_a_absolute_exposure = 1000
+            test_a = create_variant(
+                "test_a",
+                total_sum=85.0 * test_a_absolute_exposure,
+                exposure=test_a_absolute_exposure / control_absolute_exposure,
+                absolute_exposure=test_a_absolute_exposure,
+            )
+            test_b_absolute_exposure = 1000
+            test_b = create_variant(
+                "test_b",
+                total_sum=150.0 * test_b_absolute_exposure,
+                exposure=test_b_absolute_exposure / control_absolute_exposure,
+                absolute_exposure=test_b_absolute_exposure,
+            )
+            test_c_absolute_exposure = 1000
+            test_c = create_variant(
+                "test_c",
+                total_sum=110.0 * test_c_absolute_exposure,
+                exposure=test_c_absolute_exposure / control_absolute_exposure,
+                absolute_exposure=test_c_absolute_exposure,
+            )
+            probabilities = calculate_probabilities(control, [test_a, test_b, test_c])
+
+            self.assertEqual(len(probabilities), 4)
+            if stats_version == 2:
+                self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
+                self.assertAlmostEqual(probabilities[1], 0.05, delta=0.05)
+                self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
+                self.assertAlmostEqual(probabilities[3], 0.99, delta=0.05)
+            else:
+                self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
+                self.assertAlmostEqual(probabilities[1], 0, delta=0.05)
+                self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
+                self.assertAlmostEqual(probabilities[3], 0.0, delta=0.05)
+
+        self.run_test_for_both_implementations(run_test)
+
     @flaky(max_runs=5, min_passes=1)
     def test_insufficient_sample_size(self):
         """Test with sample size below threshold"""
diff --git a/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py
index 67c569407c849..97436ff5584cf 100644
--- a/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py
+++ b/posthog/hogql_queries/experiments/test/test_trends_statistics_count.py
@@ -247,13 +247,18 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
 
         intervals = calculate_credible_intervals([control, test_a, test_b, test_c])
         self.assertEqual(len(probabilities), 4)
-        self.assertTrue(probabilities[2] > 0.9)  # test_b should be winning
-        self.assertTrue(probabilities[1] < 0.1)  # test_a should be losing
-        self.assertTrue(probabilities[0] < 0.1)  # control should be losing
         self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
         if stats_version == 2:
+            self.assertTrue(probabilities[0] < 0.1)  # control is losing
+            self.assertTrue(probabilities[1] > 0.7)  # test_a beats control, but less confidently
+            self.assertTrue(probabilities[2] > 0.9)  # test_b beats control
+            self.assertTrue(probabilities[3] > 0.9)  # test_c beats control
             self.assertEqual(p_value, 0)
         else:
+            self.assertTrue(probabilities[0] < 0.1)  # control should be losing
+            self.assertTrue(probabilities[1] < 0.1)  # test_a should be losing
+            self.assertTrue(probabilities[2] > 0.9)  # test_b should be winning
+            self.assertTrue(probabilities[3] < 0.1)  # test_c should be losing
             self.assertLess(p_value, 0.001)
 
         # Control at 10%
@@ -274,6 +279,51 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, ca
 
         self.run_test_for_both_implementations(run_test)
 
+    @flaky(max_runs=5, min_passes=1)
+    def test_many_variants_win_probability_compared_to_control(self):
+        """Test with multiple variants, win probability compared to control"""
+
+        def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
+            control_absolute_exposure = 1000
+            control = create_variant("control", count=100, exposure=1, absolute_exposure=control_absolute_exposure)
+            test_a_absolute_exposure = 1000
+            test_a = create_variant(
+                "test_a",
+                count=85,
+                exposure=test_a_absolute_exposure / control_absolute_exposure,
+                absolute_exposure=test_a_absolute_exposure,
+            )
+            test_b_absolute_exposure = 1000
+            test_b = create_variant(
+                "test_b",
+                count=150,
+                exposure=test_b_absolute_exposure / control_absolute_exposure,
+                absolute_exposure=test_b_absolute_exposure,
+            )
+            test_c_absolute_exposure = 1000
+            test_c = create_variant(
+                "test_c",
+                count=110,
+                exposure=test_c_absolute_exposure / control_absolute_exposure,
+                absolute_exposure=test_c_absolute_exposure,
+            )
+
+            probabilities = calculate_probabilities(control, [test_a, test_b, test_c])
+
+            self.assertEqual(len(probabilities), 4)
+            if stats_version == 2:
+                self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
+                self.assertAlmostEqual(probabilities[1], 0.13, delta=0.05)
+                self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
+                self.assertAlmostEqual(probabilities[3], 0.75, delta=0.05)
+            else:
+                self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
+                self.assertAlmostEqual(probabilities[1], 0, delta=0.05)
+                self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
+                self.assertAlmostEqual(probabilities[3], 0.0, delta=0.05)
+
+        self.run_test_for_both_implementations(run_test)
+
     @flaky(max_runs=5, min_passes=1)
     def test_real_world_data_1(self):
         """Test with multiple variants, one clear winner"""
diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py
index 359951ba38f07..6ae188398410e 100644
--- a/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py
+++ b/posthog/hogql_queries/experiments/trends_statistics_v2_continuous.py
@@ -44,11 +44,9 @@ def calculate_probabilities_v2_continuous(
     Returns:
     --------
     list[float]
-        A list of probabilities where each element represents the probability that the
-        corresponding variant is the best (has highest mean value) among all variants:
-        - index 0: probability control variant is best
-        - index i>0: probability test variant i-1 is best
-        All probabilities sum to 1.0
+        A list of probabilities where each element represents:
+        - index 0: probability control variant beats the best test variant
+        - index i>0: probability test variant i-1 beats control
 
     Notes:
     ------
@@ -118,16 +116,14 @@ def calculate_probabilities_v2_continuous(
 
     # Calculate probabilities
     probabilities = []
-    # Probability control wins (beats all test variants)
-    control_wins = np.all([samples_control > test_sample for test_sample in test_samples], axis=0)
+    # Probability control wins (beats the best test variant)
+    best_test_samples = np.max(test_samples, axis=0)  # Best test variant at each sample point
+    control_wins = samples_control > best_test_samples
     probabilities.append(float(np.mean(control_wins)))
 
-    # Probability each test variant wins (beats control and all other test variants)
-    for i, test_sample in enumerate(test_samples):
-        other_test_samples = test_samples[:i] + test_samples[i + 1 :]
-        variant_wins = np.all(
-            [test_sample > samples_control] + [test_sample > other for other in other_test_samples], axis=0
-        )
+    # Probability each test variant wins (beats control only)
+    for test_sample in test_samples:
+        variant_wins = test_sample > samples_control
         probabilities.append(float(np.mean(variant_wins)))
 
     return probabilities
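The behavioral difference between the old "probability of being the best variant" and the new "probability of beating control" is easiest to see on a toy set of posterior draws. The sketch below is illustrative only; the sample values are made up and are not produced by the modules in this diff.

```python
import numpy as np

# Toy posterior draws (rows: control, test_a, test_b; columns: simulation draws).
samples = np.array(
    [
        [1.0, 1.2, 0.9, 1.1],  # control
        [1.1, 1.3, 1.0, 1.0],  # test_a
        [1.2, 1.1, 1.3, 1.2],  # test_b
    ]
)
control, tests = samples[0], samples[1:]

# Old semantics: probability each variant is the single best among all variants.
best_probs = [float((samples[i] == samples.max(axis=0)).mean()) for i in range(len(samples))]

# New semantics: control vs. the best test variant, then each test variant vs. control.
new_probs = [float(np.mean(control >= tests.max(axis=0)))]
new_probs += [float(np.mean(t > control)) for t in tests]

print(best_probs)  # [0.0, 0.25, 0.75] -- sums to 1 (ignoring ties)
print(new_probs)   # [0.0, 0.75, 0.75] -- pairwise against control, need not sum to 1
```

Note how test_a's probability rises once the other test variants no longer "steal" its wins, which is why the updated tests above expect markedly higher values for the non-winning test variants under stats version 2.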
diff --git a/posthog/hogql_queries/experiments/trends_statistics_v2_count.py b/posthog/hogql_queries/experiments/trends_statistics_v2_count.py
index 38bf41ac0ca1d..7658e2622027a 100644
--- a/posthog/hogql_queries/experiments/trends_statistics_v2_count.py
+++ b/posthog/hogql_queries/experiments/trends_statistics_v2_count.py
@@ -36,9 +36,9 @@ def calculate_probabilities_v2_count(
     Returns:
     --------
     list[float]
-        A list of probabilities that sum to 1, where:
-        - The first element is the probability that the control variant is the best
-        - Subsequent elements are the probabilities that each test variant is the best
+        A list of probabilities where each element represents:
+        - index 0: probability control variant beats the best test variant
+        - index i>0: probability test variant i-1 beats control
 
     Notes:
     ------
@@ -78,16 +78,14 @@ def calculate_probabilities_v2_count(
 
     # Calculate probabilities
    probabilities = []
-    # Probability control wins (beats all test variants)
-    control_wins = np.all([samples_control > test_sample for test_sample in test_samples], axis=0)
+    # Probability control wins (beats the best test variant)
+    best_test_samples = np.max(test_samples, axis=0)
+    control_wins = samples_control > best_test_samples
     probabilities.append(float(np.mean(control_wins)))
 
-    # Probability each test variant wins (beats control and all other test variants)
-    for i, test_sample in enumerate(test_samples):
-        other_test_samples = test_samples[:i] + test_samples[i + 1 :]
-        variant_wins = np.all(
-            [test_sample > samples_control] + [test_sample > other for other in other_test_samples], axis=0
-        )
+    # Probability each test variant wins (beats control only)
+    for test_sample in test_samples:
+        variant_wins = test_sample > samples_control
         probabilities.append(float(np.mean(variant_wins)))
 
     return probabilities
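With these pairwise probabilities, the stats-version-2 message added to experimentLogic.tsx can be read as roughly the check sketched below. `has_high_win_probability` is a hypothetical helper written for illustration, not code from this PR; the 90% threshold is taken from the frontend copy above.

```python
# Hypothetical helper: maps the returned probabilities onto the
# "no variant (control or test) has a win probability higher than 90%" message.
def has_high_win_probability(probabilities: list[float], threshold: float = 0.9) -> bool:
    # probabilities[0]: control beats the best test variant
    # probabilities[1:]: each test variant beats control
    return any(p > threshold for p in probabilities)


assert has_high_win_probability([0.0, 0.13, 0.99, 0.75])   # test_b clears the bar
assert not has_high_win_probability([0.2, 0.6, 0.7, 0.5])  # would surface as LowWinProbability
```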