fix(experiments): Calculate win probability against control #27804

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions ee/clickhouse/models/test/__snapshots__/test_cohort.ambr
@@ -233,7 +233,7 @@
WHERE equals(person_distinct_id_overrides.team_id, 99999)
GROUP BY person_distinct_id_overrides.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id_overrides.is_deleted, person_distinct_id_overrides.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS e__override ON equals(e.distinct_id, e__override.distinct_id)
- WHERE and(equals(e.team_id, 99999), greaterOrEquals(timestamp, toDateTime64('2023-01-21 00:00:00.000000', 6, 'UTC')), lessOrEquals(timestamp, toDateTime64('2025-01-21 23:59:59.999999', 6, 'UTC')), equals(e.event, '$pageview')))
+ WHERE and(equals(e.team_id, 99999), greaterOrEquals(timestamp, toDateTime64('2023-01-23 00:00:00.000000', 6, 'UTC')), lessOrEquals(timestamp, toDateTime64('2025-01-23 23:59:59.999999', 6, 'UTC')), equals(e.event, '$pageview')))
GROUP BY actor_id) AS source
ORDER BY source.id ASC
LIMIT 100 SETTINGS optimize_aggregation_in_order=1,
@@ -374,7 +374,7 @@
actor_id AS id
FROM
(SELECT min(toTimeZone(e.timestamp, 'UTC')) AS min_timestamp,
- minIf(toTimeZone(e.timestamp, 'UTC'), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2025-01-06 00:00:00.000000', 6, 'UTC'))) AS min_timestamp_with_condition,
+ minIf(toTimeZone(e.timestamp, 'UTC'), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2025-01-08 00:00:00.000000', 6, 'UTC'))) AS min_timestamp_with_condition,
if(not(empty(e__override.distinct_id)), e__override.person_id, e.person_id) AS actor_id,
argMin(e.uuid, toTimeZone(e.timestamp, 'UTC')) AS uuid,
argMin(e.distinct_id, toTimeZone(e.timestamp, 'UTC')) AS distinct_id
@@ -386,7 +386,7 @@
WHERE equals(person_distinct_id_overrides.team_id, 99999)
GROUP BY person_distinct_id_overrides.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id_overrides.is_deleted, person_distinct_id_overrides.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS e__override ON equals(e.distinct_id, e__override.distinct_id)
- WHERE and(equals(e.team_id, 99999), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2025-01-21 23:59:59.999999', 6, 'UTC')), equals(e.event, 'signup'))
+ WHERE and(equals(e.team_id, 99999), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2025-01-23 23:59:59.999999', 6, 'UTC')), equals(e.event, 'signup'))
GROUP BY if(not(empty(e__override.distinct_id)), e__override.person_id, e.person_id)
HAVING ifNull(equals(min_timestamp, min_timestamp_with_condition), isNull(min_timestamp)
and isNull(min_timestamp_with_condition)))
@@ -474,7 +474,7 @@
WHERE equals(person_distinct_id_overrides.team_id, 99999)
GROUP BY person_distinct_id_overrides.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id_overrides.is_deleted, person_distinct_id_overrides.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS e__override ON equals(e.distinct_id, e__override.distinct_id)
- WHERE and(equals(e.team_id, 99999), greaterOrEquals(timestamp, toDateTime64('2023-01-21 00:00:00.000000', 6, 'UTC')), lessOrEquals(timestamp, toDateTime64('2025-01-21 23:59:59.999999', 6, 'UTC')), equals(e.event, '$pageview')))
+ WHERE and(equals(e.team_id, 99999), greaterOrEquals(timestamp, toDateTime64('2023-01-23 00:00:00.000000', 6, 'UTC')), lessOrEquals(timestamp, toDateTime64('2025-01-23 23:59:59.999999', 6, 'UTC')), equals(e.event, '$pageview')))
GROUP BY actor_id) AS source
ORDER BY source.id ASC
LIMIT 100 SETTINGS optimize_aggregation_in_order=1,
@@ -488,7 +488,7 @@
actor_id AS id
FROM
(SELECT min(toTimeZone(e.timestamp, 'UTC')) AS min_timestamp,
- minIf(toTimeZone(e.timestamp, 'UTC'), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2025-01-06 00:00:00.000000', 6, 'UTC'))) AS min_timestamp_with_condition,
+ minIf(toTimeZone(e.timestamp, 'UTC'), greaterOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2025-01-08 00:00:00.000000', 6, 'UTC'))) AS min_timestamp_with_condition,
if(not(empty(e__override.distinct_id)), e__override.person_id, e.person_id) AS actor_id,
argMin(e.uuid, toTimeZone(e.timestamp, 'UTC')) AS uuid,
argMin(e.distinct_id, toTimeZone(e.timestamp, 'UTC')) AS distinct_id
Expand All @@ -500,7 +500,7 @@
WHERE equals(person_distinct_id_overrides.team_id, 99999)
GROUP BY person_distinct_id_overrides.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id_overrides.is_deleted, person_distinct_id_overrides.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS e__override ON equals(e.distinct_id, e__override.distinct_id)
- WHERE and(equals(e.team_id, 99999), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2025-01-21 23:59:59.999999', 6, 'UTC')), equals(e.event, 'signup'))
+ WHERE and(equals(e.team_id, 99999), lessOrEquals(toTimeZone(e.timestamp, 'UTC'), toDateTime64('2025-01-23 23:59:59.999999', 6, 'UTC')), equals(e.event, 'signup'))
GROUP BY if(not(empty(e__override.distinct_id)), e__override.person_id, e.person_id)
HAVING ifNull(equals(min_timestamp, min_timestamp_with_condition), isNull(min_timestamp)
and isNull(min_timestamp_with_condition)))
2 changes: 1 addition & 1 deletion frontend/src/scenes/experiments/experimentLogic.tsx
@@ -1309,7 +1309,7 @@ export const experimentLogic = kea<experimentLogicType>([
}

if (results?.significance_code === ExperimentSignificanceCode.LowWinProbability) {
- return 'This is because the win probability of all test variants combined is less than 90%.'
+ return 'This is because no variant (control or test) has a win probability higher than 90%.'
}

if (results?.significance_code === ExperimentSignificanceCode.NotEnoughExposure) {
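Note on the copy change above: it mirrors the backend's significance gate under the new semantics, where an experiment is flagged as not significant when no entry in the probability list (control's entry included) clears 90%. A minimal sketch of that check, assuming a 0.9 threshold and illustrative names (neither is lifted from this PR):

from enum import Enum

class SignificanceCode(Enum):
    SIGNIFICANT = "significant"
    LOW_WIN_PROBABILITY = "low_win_probability"

MIN_WIN_PROBABILITY = 0.9  # assumed threshold, matching the 90% in the UI copy

def significance(probabilities: list[float]) -> SignificanceCode:
    # probabilities[0] is control vs. the best test variant;
    # probabilities[1:] are each test variant vs. control.
    if max(probabilities) < MIN_WIN_PROBABILITY:
        return SignificanceCode.LOW_WIN_PROBABILITY
    return SignificanceCode.SIGNIFICANT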
2 changes: 1 addition & 1 deletion posthog/api/test/__snapshots__/test_cohort.ambr
@@ -174,7 +174,7 @@
WHERE equals(person_distinct_id_overrides.team_id, 99999)
GROUP BY person_distinct_id_overrides.distinct_id
HAVING ifNull(equals(argMax(person_distinct_id_overrides.is_deleted, person_distinct_id_overrides.version), 0), 0) SETTINGS optimize_aggregation_in_order=1) AS e__override ON equals(e.distinct_id, e__override.distinct_id)
- WHERE and(equals(e.team_id, 99999), greaterOrEquals(timestamp, toDateTime64('2025-01-20 00:00:00.000000', 6, 'UTC')), lessOrEquals(timestamp, toDateTime64('2025-01-21 23:59:59.999999', 6, 'UTC')), equals(e.event, '$pageview')))
+ WHERE and(equals(e.team_id, 99999), greaterOrEquals(timestamp, toDateTime64('2025-01-22 00:00:00.000000', 6, 'UTC')), lessOrEquals(timestamp, toDateTime64('2025-01-23 23:59:59.999999', 6, 'UTC')), equals(e.event, '$pageview')))
GROUP BY actor_id) AS source
ORDER BY source.id ASC
LIMIT 100 SETTINGS optimize_aggregation_in_order=1,
22 changes: 16 additions & 6 deletions posthog/hogql_queries/experiments/funnels_statistics_v2.py
@@ -37,9 +37,9 @@ def calculate_probabilities_v2(
Returns:
--------
list[float]
- A list of probabilities that sum to 1, where:
- - The first element is the probability that the control variant is the best
- - Subsequent elements are the probabilities that each test variant is the best
+ A list of probabilities where each element represents:
+ - index 0: probability control beats the best test variant
+ - index i>0: probability test variant i-1 beats control

Notes:
------
@@ -70,10 +70,20 @@
samples.append(variant_samples)

samples_array = np.array(samples)
- # Calculate probability of each variant being the best
probabilities = []
- for i in range(len(all_variants)):
- probability = (samples_array[i] == np.max(samples_array, axis=0)).mean()
+ control_samples = samples_array[0]  # Control is always the first variant
+
+ # Find the best test variant at each sample point
+ test_variants_samples = samples_array[1:]
+ best_variant_samples = np.max(test_variants_samples, axis=0)
+
+ # Probability that control beats the best test variant
+ control_prob = np.mean(control_samples >= best_variant_samples)
+ probabilities.append(float(control_prob))
+
+ # For each test variant, calculate the probability of beating control
+ for i in range(1, len(all_variants)):
+ probability = np.mean(samples_array[i] > control_samples)
probabilities.append(float(probability))

return probabilities
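Taken together, the new block computes P(control >= best test variant) once, then P(test_i > control) for each test variant, so the returned list no longer sums to 1. A self-contained sketch of the same comparison, assuming Beta(1 + successes, 1 + failures) posteriors for funnel conversion (the helper name, seed, and sample count are illustrative, not taken from the PR):

import numpy as np

SAMPLE_SIZE = 10_000
rng = np.random.default_rng(42)

def win_probabilities_vs_control(variants: list[tuple[int, int]]) -> list[float]:
    """variants: (success_count, failure_count) pairs, control first."""
    # One row of posterior draws per variant.
    samples = np.array([rng.beta(1 + s, 1 + f, SAMPLE_SIZE) for s, f in variants])
    control, tests = samples[0], samples[1:]
    # Index 0: control against the best test variant at each draw.
    probs = [float(np.mean(control >= tests.max(axis=0)))]
    # Index i > 0: each test variant against control.
    probs += [float(np.mean(t > control)) for t in tests]
    return probs

# Same variants as the funnel test added below: 10%, 8%, 15%, 11% conversion.
print(win_probabilities_vs_control([(100, 900), (80, 920), (150, 850), (110, 890)]))
# Comes out near [0.0, 0.05, 0.99, 0.76]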
29 changes: 29 additions & 0 deletions posthog/hogql_queries/experiments/test/test_funnels_statistics.py
@@ -173,6 +173,35 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):

self.run_test_for_both_implementations(run_test)

@flaky(max_runs=5, min_passes=1)
def test_many_variants_win_probability_compared_to_control(self):
"""Test with multiple variants, win probability compared to control"""

def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
# test_a is worse than control
# test_b is best overall
# test_c is slightly better than control
control = create_variant("control", success_count=100, failure_count=900) # 10% conversion
test_a = create_variant("test_a", success_count=80, failure_count=920) # 8% conversion
test_b = create_variant("test_b", success_count=150, failure_count=850) # 15% conversion
test_c = create_variant("test_c", success_count=110, failure_count=890) # 11% conversion

probabilities = calculate_probabilities(control, [test_a, test_b, test_c])

self.assertEqual(len(probabilities), 4)
if stats_version == 2:
self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
self.assertAlmostEqual(probabilities[1], 0.05, delta=0.05)
self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
self.assertAlmostEqual(probabilities[3], 0.76, delta=0.05)
else:
self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
self.assertAlmostEqual(probabilities[1], 0, delta=0.05)
self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
self.assertAlmostEqual(probabilities[3], 0.0, delta=0.05)

self.run_test_for_both_implementations(run_test)

@flaky(max_runs=5, min_passes=1)
def test_insufficient_sample_size(self):
"""Test with sample size below threshold"""
@@ -343,9 +343,11 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):

self.assertEqual(len(probabilities), 4)
if stats_version == 2:
- self.assertTrue(probabilities[2] > 0.9)
- self.assertTrue(probabilities[1] < 0.1)
self.assertTrue(probabilities[0] < 0.1)
+ self.assertTrue(probabilities[1] > 0.9)
+ self.assertTrue(probabilities[2] > 0.9)
+ self.assertTrue(probabilities[3] > 0.9)

self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
self.assertEqual(p_value, 0)

@@ -389,6 +391,55 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):

self.run_test_for_both_implementations(run_test)

@flaky(max_runs=5, min_passes=1)
def test_many_variants_win_probability_compared_to_control(self):
"""Test with multiple variants, win probability compared to control"""

def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
control_absolute_exposure = 1000
control = create_variant(
"control",
total_sum=100.0 * control_absolute_exposure,
exposure=1,
absolute_exposure=control_absolute_exposure,
)
test_a_absolute_exposure = 1000
test_a = create_variant(
"test_a",
total_sum=85.0 * test_a_absolute_exposure,
exposure=test_a_absolute_exposure / control_absolute_exposure,
absolute_exposure=test_a_absolute_exposure,
)
test_b_absolute_exposure = 1000
test_b = create_variant(
"test_b",
total_sum=150.0 * test_b_absolute_exposure,
exposure=test_b_absolute_exposure / control_absolute_exposure,
absolute_exposure=test_b_absolute_exposure,
)
test_c_absolute_exposure = 1000
test_c = create_variant(
"test_c",
total_sum=110.0 * test_c_absolute_exposure,
exposure=test_c_absolute_exposure / control_absolute_exposure,
absolute_exposure=test_c_absolute_exposure,
)
probabilities = calculate_probabilities(control, [test_a, test_b, test_c])

self.assertEqual(len(probabilities), 4)
if stats_version == 2:
self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
self.assertAlmostEqual(probabilities[1], 0.05, delta=0.05)
self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
self.assertAlmostEqual(probabilities[3], 0.99, delta=0.05)
else:
self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
self.assertAlmostEqual(probabilities[1], 0, delta=0.05)
self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
self.assertAlmostEqual(probabilities[3], 0.0, delta=0.05)

self.run_test_for_both_implementations(run_test)
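The v2 expectations in this test follow from the same comparison step applied to a continuous metric. A directional sketch only, assuming normal posterior draws of each variant's mean on a log scale with a fixed log-variance (the 0.75 constant and the normal approximation are assumptions of this sketch, not a transcription of the continuous model):

import numpy as np

rng = np.random.default_rng(7)
N, LOG_VARIANCE = 10_000, 0.75

def posterior_mean_draws(mean_value: float, exposure: int) -> np.ndarray:
    # Normal approximation to the posterior over the mean of log-values.
    return rng.normal(np.log(mean_value), np.sqrt(LOG_VARIANCE / exposure), N)

# Per-user means from the test above: control 100, test_a 85, test_b 150, test_c 110.
samples = np.stack([posterior_mean_draws(m, 1000) for m in (100.0, 85.0, 150.0, 110.0)])
control, tests = samples[0], samples[1:]
probs = [float(np.mean(control >= tests.max(axis=0)))]
probs += [float(np.mean(t > control)) for t in tests]
print(probs)  # directionally [~0, ~0, ~1, ~1], matching the v2 branch above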

@flaky(max_runs=5, min_passes=1)
def test_insufficient_sample_size(self):
"""Test with sample size below threshold"""
Expand Down
@@ -247,13 +247,18 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
intervals = calculate_credible_intervals([control, test_a, test_b, test_c])

self.assertEqual(len(probabilities), 4)
- self.assertTrue(probabilities[2] > 0.9) # test_b should be winning
- self.assertTrue(probabilities[1] < 0.1) # test_a should be losing
- self.assertTrue(probabilities[0] < 0.1) # control should be losing
self.assertEqual(significance, ExperimentSignificanceCode.SIGNIFICANT)
if stats_version == 2:
+ self.assertTrue(probabilities[0] < 0.1) # control is losing
+ self.assertTrue(probabilities[1] > 0.7) # test_a beats control, but less confidently
+ self.assertTrue(probabilities[2] > 0.9) # test_b beats control
+ self.assertTrue(probabilities[3] > 0.9) # test_c beats control
self.assertEqual(p_value, 0)
else:
+ self.assertTrue(probabilities[0] < 0.1) # control should be losing
+ self.assertTrue(probabilities[1] < 0.1) # test_a should be losing
+ self.assertTrue(probabilities[2] > 0.9) # test_b should be winning
+ self.assertTrue(probabilities[3] < 0.1) # test_c should be losing
self.assertLess(p_value, 0.001)

# Control at 10%
@@ -274,6 +279,51 @@ def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):

self.run_test_for_both_implementations(run_test)

@flaky(max_runs=5, min_passes=1)
def test_many_variants_win_probability_compared_to_control(self):
"""Test with multiple variants, win probability compared to control"""

def run_test(stats_version, calculate_probabilities, are_results_significant, calculate_credible_intervals):
control_absolute_exposure = 1000
control = create_variant("control", count=100, exposure=1, absolute_exposure=control_absolute_exposure)
test_a_absolute_exposure = 1000
test_a = create_variant(
"test_a",
count=85,
exposure=test_a_absolute_exposure / control_absolute_exposure,
absolute_exposure=test_a_absolute_exposure,
)
test_b_absolute_exposure = 1000
test_b = create_variant(
"test_b",
count=150,
exposure=test_b_absolute_exposure / control_absolute_exposure,
absolute_exposure=test_b_absolute_exposure,
)
test_c_absolute_exposure = 1000
test_c = create_variant(
"test_c",
count=110,
exposure=test_c_absolute_exposure / control_absolute_exposure,
absolute_exposure=test_c_absolute_exposure,
)

probabilities = calculate_probabilities(control, [test_a, test_b, test_c])

self.assertEqual(len(probabilities), 4)
if stats_version == 2:
self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
self.assertAlmostEqual(probabilities[1], 0.13, delta=0.05)
self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
self.assertAlmostEqual(probabilities[3], 0.75, delta=0.05)
else:
self.assertAlmostEqual(probabilities[0], 0, delta=0.05)
self.assertAlmostEqual(probabilities[1], 0, delta=0.05)
self.assertAlmostEqual(probabilities[2], 0.99, delta=0.05)
self.assertAlmostEqual(probabilities[3], 0.0, delta=0.05)

self.run_test_for_both_implementations(run_test)
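For the count metric, the v2 expectations above (about 0.13 for test_a, 0.75 for test_c) are reproducible with conjugate Gamma posteriors over Poisson rates, one standard choice for event counts; treat that prior as an assumption of this sketch rather than a transcription of the implementation:

import numpy as np

rng = np.random.default_rng(3)
N = 10_000

def rate_draws(count: int, exposure: int) -> np.ndarray:
    # Gamma(count + 1, rate=exposure): conjugate posterior for a Poisson rate
    # under a flat prior; numpy's gamma takes a scale (1 / rate) parameter.
    return rng.gamma(count + 1, 1.0 / exposure, N)

# Counts from the test above at equal exposures of 1000, control first.
samples = np.stack([rate_draws(c, 1000) for c in (100, 85, 150, 110)])
control, tests = samples[0], samples[1:]
probs = [float(np.mean(control >= tests.max(axis=0)))]
probs += [float(np.mean(t > control)) for t in tests]
print(probs)  # near [0.0, 0.13, 1.0, 0.75]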

@flaky(max_runs=5, min_passes=1)
def test_real_world_data_1(self):
"""Test with multiple variants, one clear winner"""