A/B Testing in Production: Statistical Significance, Sample Size, and Rollout Testing
A/B testing infrastructure is code, and code needs tests. This guide covers testing the experiment layer itself: variant assignment consistency, statistical significance calculations, minimum detectable effect, sample size requirements, and percentage rollout correctness—not just whether the feature exists behind the flag.
Testing the Test Infrastructure
Most teams write tests for features behind A/B flags, but not for the A/B testing machinery itself. The machinery is what determines whether your results are valid. Bugs here are worse than feature bugs: they produce false positives, make you ship regressions based on "winning" experiments, or silently invalidate weeks of data collection.
The experiment layer has several testable components:
- Assignment — does a given user always get assigned the same variant?
- Exclusion — are excluded users actually excluded?
- Distribution — does a 50/50 split actually produce ~50% in each bucket?
- Statistical significance — is your significance calculation correct?
- Sample size — is the minimum sample size enforced before declaring a winner?
Testing Variant Assignment
Assignment must be deterministic: the same user ID must always get the same variant. If assignment varies between page loads, your experiment is corrupted.
// src/experiment.js
const crypto = require('crypto');
/**
 * Deterministically picks one of an experiment's variants for a user.
 *
 * The experiment ID and user ID are joined as `experimentId:userId` and hashed
 * with SHA-256. The leading 32 bits of the digest, taken modulo the number of
 * variants, select the entry that is returned, so the same inputs always
 * produce the same variant.
 *
 * @param {string} userId - Identifier of the user being bucketed.
 * @param {string} experimentId - Identifier of the experiment (part of the hash input).
 * @param {string[]} [variants=['control', 'treatment']] - Variants to choose from.
 * @returns {string} The selected entry of `variants`.
 */
function assignVariant(userId, experimentId, variants = ['control', 'treatment']) {
  const hasher = crypto.createHash('sha256');
  hasher.update(`${experimentId}:${userId}`);
  const digestHex = hasher.digest('hex');

  // Eight hex characters are the leading 32 bits of the digest.
  const leadingBits = parseInt(digestHex.slice(0, 8), 16);
  const variantIndex = leadingBits % variants.length;
  return variants[variantIndex];
}
module.exports = { assignVariant };

// test/experiment.test.js
const { assignVariant } = require('../src/experiment');
describe('assignVariant', () => {
it('assigns the same variant on repeated calls for the same user', () => {
const userId = 'user_123';
const experimentId = 'checkout_cta';
const first = assignVariant(userId, experimentId);
const second = assignVariant(userId, experimentId);
const third = assignVariant(userId, experimentId);
expect(first).toBe(second);
expect(second).toBe(third);
});
it('assigns different users to different variants', () => {
const experimentId = 'checkout_cta';
const variants = new Set();
for (let i = 0; i < 1000; i++) {
variants.add(assignVariant(`user_${i}`, experimentId));
}
// Both variants should be assigned across 1000 users
expect(variants).toContain('control');
expect(variants).toContain('treatment');
});
it('distributes users approximately evenly with default 50/50 split', () => {
const experimentId = 'homepage_hero';
const counts = { control: 0, treatment: 0 };
for (let i = 0; i < 10000; i++) {
const variant = assignVariant(`user_${i}`, experimentId);
counts[variant]++;
}
// Allow 5% deviation from 50/50
expect(counts.control / 10000).toBeCloseTo(0.5, 1);
expect(counts.treatment / 10000).toBeCloseTo(0.5, 1);
});
it('different experiments produce independent assignments', () => {
// User assigned to control in experiment A should not be predictably
// in control for experiment B
const userId = 'user_abc';
const expA = assignVariant(userId, 'experiment_a');
const expB = assignVariant(userId, 'experiment_b');
// We can't assert they're different (could coincidentally be the same)
// but we can check both are valid variants
expect(['control', 'treatment']).toContain(expA);
expect(['control', 'treatment']).toContain(expB);
});
});Testing Percentage Rollouts
Progressive rollouts (1% → 5% → 25% → 50% → 100%) require testing that the percentage cutoff is respected:
// src/rollout.js
const crypto = require('crypto');
/**
 * Decides whether a user is inside a percentage rollout of a feature.
 *
 * Percentages of 0 or less exclude everyone and percentages of 100 or more
 * include everyone. Otherwise `featureKey:rollout:userId` is hashed with
 * SHA-256 and the leading 32 bits are mapped to a bucket from 1 to 100; the
 * user is included when the bucket does not exceed the rollout percentage.
 * The bucket does not depend on the percentage, so raising the percentage
 * only ever adds users.
 *
 * @param {string} userId - Identifier of the user being checked.
 * @param {string} featureKey - Identifier of the feature (part of the hash input).
 * @param {number} rolloutPercentage - Share of users to include, on a 0–100 scale.
 * @returns {boolean} True when the user is included in the rollout.
 */
function isInRollout(userId, featureKey, rolloutPercentage) {
  // Boundary percentages are decided without hashing.
  if (rolloutPercentage <= 0) {
    return false;
  }
  if (rolloutPercentage >= 100) {
    return true;
  }

  const hasher = crypto.createHash('sha256');
  hasher.update(`${featureKey}:rollout:${userId}`);
  const leadingBits = parseInt(hasher.digest('hex').slice(0, 8), 16);

  // Shift the 0–99 remainder to 1–100 so that "bucket <= N" selects N buckets.
  const userBucket = 1 + (leadingBits % 100);
  return userBucket <= rolloutPercentage;
}
module.exports = { isInRollout };

// test/rollout.test.js
const { isInRollout } = require('../src/rollout');
describe('isInRollout', () => {
it('includes 0% of users at 0% rollout', () => {
const included = Array.from({ length: 1000 }, (_, i) =>
isInRollout(`user_${i}`, 'feature_x', 0)
);
expect(included.every(v => v === false)).toBe(true);
});
it('includes all users at 100% rollout', () => {
const included = Array.from({ length: 1000 }, (_, i) =>
isInRollout(`user_${i}`, 'feature_x', 100)
);
expect(included.every(v => v === true)).toBe(true);
});
it('includes approximately the correct percentage at 10%', () => {
const count = Array.from({ length: 10000 }, (_, i) =>
isInRollout(`user_${i}`, 'feature_x', 10)
).filter(Boolean).length;
// Allow ±2% deviation
expect(count / 10000).toBeCloseTo(0.10, 1);
});
it('monotonically includes users as percentage increases', () => {
// Any user included at 10% must also be included at 20%
for (let i = 0; i < 100; i++) {
const at10 = isInRollout(`user_${i}`, 'feature_x', 10);
const at20 = isInRollout(`user_${i}`, 'feature_x', 20);
if (at10) {
expect(at20).toBe(true); // Included at 10% → must be included at 20%
}
}
});
});The monotonicity test is critical: if a user was in the 10% rollout, rolling to 20% should add new users, not swap them. Failing this test means your rollout key changes with percentage, which is a serious bug.
Testing Statistical Significance
Your A/B testing dashboard needs to calculate statistical significance correctly. Test the math:
// src/stats.js
/**
 * Two-proportion z-test for A/B test significance.
 *
 * Computes the pooled conversion rate, the pooled standard error, and the
 * absolute z-score of the difference between the two observed rates, then
 * converts the z-score to a two-tailed p-value via `normalCDF`.
 *
 * @param {number} controlConversions - Conversions observed in the control arm.
 * @param {number} controlTotal - Users observed in the control arm.
 * @param {number} treatmentConversions - Conversions observed in the treatment arm.
 * @param {number} treatmentTotal - Users observed in the treatment arm.
 * @returns {number} Two-tailed p-value. Returns 1 when either arm has no
 *   observations or when the pooled standard error is zero, since neither
 *   case provides evidence of a difference.
 */
function calculatePValue(controlConversions, controlTotal, treatmentConversions, treatmentTotal) {
  // An arm with no users makes the divisions below produce NaN or Infinity,
  // which would otherwise propagate out as a NaN p-value.
  if (!(controlTotal > 0) || !(treatmentTotal > 0)) return 1;

  const p1 = controlConversions / controlTotal;
  const p2 = treatmentConversions / treatmentTotal;
  const pooled = (controlConversions + treatmentConversions) / (controlTotal + treatmentTotal);
  const se = Math.sqrt(pooled * (1 - pooled) * (1 / controlTotal + 1 / treatmentTotal));
  if (se === 0) return 1; // No variance (pooled rate is exactly 0 or 1)

  const z = Math.abs(p1 - p2) / se;
  // Two-tailed p-value: twice the upper-tail probability beyond |z|.
  return 2 * (1 - normalCDF(z));
}
/**
 * Standard normal cumulative distribution function Φ(z).
 *
 * Uses a five-term polynomial approximation of the upper-tail probability
 * for |z| and applies the symmetry Φ(-z) = 1 - Φ(z) for negative arguments.
 *
 * @param {number} z - Z-score; may be negative.
 * @returns {number} Approximate probability that a standard normal variable is <= z.
 */
function normalCDF(z) {
  const t = 1 / (1 + 0.2316419 * Math.abs(z));
  const poly = t * (0.319381530 + t * (-0.356563782 + t * (1.781477937 + t * (-1.821255978 + t * 1.330274429))));
  // Probability mass beyond |z| in one tail.
  const tail = (1 / Math.sqrt(2 * Math.PI)) * Math.exp(-0.5 * z * z) * poly;
  // The polynomial is only valid for non-negative arguments; without the
  // branch a negative z would be reported as Φ(|z|).
  return z >= 0 ? 1 - tail : tail;
}
/**
 * Reports whether a p-value clears the significance threshold.
 *
 * The comparison is strict: a p-value equal to alpha is not significant.
 *
 * @param {number} pValue - P-value to evaluate.
 * @param {number} [alpha=0.05] - Significance threshold.
 * @returns {boolean} True when `pValue` is strictly below `alpha`.
 */
function isStatisticallySignificant(pValue, alpha = 0.05) {
  const threshold = alpha;
  return threshold > pValue;
}
module.exports = { calculatePValue, isStatisticallySignificant };

// test/stats.test.js
const { calculatePValue, isStatisticallySignificant } = require('../src/stats');
// Jest suite for calculatePValue. Expected values are worked out by hand from
// the two-proportion z-test so the suite pins known outputs rather than only
// checking that the result is a probability.
describe('calculatePValue', () => {
  it('returns near 1.0 for identical conversion rates', () => {
    // Same rates → z = 0 → p ≈ 1
    const p = calculatePValue(100, 1000, 100, 1000);
    expect(p).toBeGreaterThan(0.9);
  });

  it('returns low p-value for clearly different conversion rates', () => {
    // 10% vs 20% with large sample → z ≈ 6.26 → highly significant
    const p = calculatePValue(100, 1000, 200, 1000);
    expect(p).toBeLessThan(0.001);
  });

  it('returns p < 0.05 for known significant result', () => {
    // 5% vs 7% with 5000 users each: pooled rate 0.06, standard error
    // ≈ 0.00475, z ≈ 4.21, so the expected two-tailed p is ≈ 0.00003.
    const p = calculatePValue(250, 5000, 350, 5000);
    expect(p).toBeLessThan(0.05);
    expect(p).toBeGreaterThan(0.00001);
    expect(p).toBeLessThan(0.0001);
  });

  it('handles equal sample sizes correctly', () => {
    // 10% vs 15% with 500 users each: z ≈ 2.39 → expected p ≈ 0.0168
    const p = calculatePValue(50, 500, 75, 500);
    expect(p).toBeGreaterThan(0);
    expect(p).toBeLessThan(1);
    expect(p).toBeCloseTo(0.0168, 3);
  });
});
describe('isStatisticallySignificant', () => {
it('is significant at p=0.01 with alpha=0.05', () => {
expect(isStatisticallySignificant(0.01)).toBe(true);
});
it('is not significant at p=0.10 with alpha=0.05', () => {
expect(isStatisticallySignificant(0.10)).toBe(false);
});
it('uses custom alpha', () => {
expect(isStatisticallySignificant(0.04, 0.01)).toBe(false); // 4% > 1%
expect(isStatisticallySignificant(0.004, 0.01)).toBe(true); // 0.4% < 1%
});
});Testing Minimum Sample Size Requirements
Stopping an experiment before it reaches minimum sample size is the most common A/B testing mistake. Enforce it in code and test it:
// src/experiment-guard.js
/**
 * Quantile function (inverse CDF) of the standard normal distribution,
 * computed with Peter Acklam's rational approximation.
 *
 * @param {number} p - Probability strictly between 0 and 1.
 * @returns {number} The z-score whose cumulative probability is `p`.
 */
function inverseNormalCDF(p) {
  const a = [
    -3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02,
    1.383577518672690e+02, -3.066479806614716e+01, 2.506628277459239e+00,
  ];
  const b = [
    -5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02,
    6.680131188771972e+01, -1.328068155288572e+01,
  ];
  const c = [
    -7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00,
    -2.549732539343734e+00, 4.374664141464968e+00, 2.938163982698783e+00,
  ];
  const d = [
    7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00,
    3.754408661907416e+00,
  ];
  const lowerBreak = 0.02425;
  const upperBreak = 1 - lowerBreak;

  // Shared rational function for both tails; returns a negative z-score.
  const tail = (q) =>
    (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) /
    ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);

  if (p < lowerBreak) {
    return tail(Math.sqrt(-2 * Math.log(p)));
  }
  if (p > upperBreak) {
    return -tail(Math.sqrt(-2 * Math.log(1 - p)));
  }
  const q = p - 0.5;
  const r = q * q;
  return (
    ((((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * q) /
    (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1)
  );
}

/**
 * Minimum number of users required in EACH arm of a two-proportion test.
 *
 * @param {number} baselineRate - Conversion rate of the control arm.
 * @param {number} minimumDetectableEffect - Relative lift to detect; the
 *   treatment rate is taken as `baselineRate * (1 + minimumDetectableEffect)`.
 * @param {number} [power=0.8] - Desired statistical power, strictly between 0 and 1.
 * @param {number} [alpha=0.05] - Two-tailed significance level, strictly between 0 and 1.
 * @returns {number} Required per-arm sample size, rounded up.
 * @throws {RangeError} If `power` or `alpha` is not strictly between 0 and 1.
 */
function minimumSampleSize(baselineRate, minimumDetectableEffect, power = 0.8, alpha = 0.05) {
  if (!(alpha > 0 && alpha < 1)) {
    throw new RangeError(`alpha must be strictly between 0 and 1, got ${alpha}`);
  }
  if (!(power > 0 && power < 1)) {
    throw new RangeError(`power must be strictly between 0 and 1, got ${power}`);
  }

  // Critical values are rounded to four decimals, as in a printed z-table.
  // For the defaults this gives exactly 1.96 and 0.8416, the constants this
  // function previously hard-coded, so default results are unchanged.
  const tableZ = (probability) => Math.round(inverseNormalCDF(probability) * 1e4) / 1e4;
  const zAlpha = tableZ(1 - alpha / 2); // two-tailed
  const zBeta = tableZ(power);

  // Using the standard formula for two-proportion test
  const p1 = baselineRate;
  const p2 = baselineRate * (1 + minimumDetectableEffect);
  const pBar = (p1 + p2) / 2;
  const numerator = Math.pow(zAlpha * Math.sqrt(2 * pBar * (1 - pBar)) + zBeta * Math.sqrt(p1 * (1 - p1) + p2 * (1 - p2)), 2);
  const denominator = Math.pow(p2 - p1, 2);
  return Math.ceil(numerator / denominator);
}
/**
 * Guards against stopping an experiment early.
 *
 * Computes the required per-arm sample size from the experiment's baseline
 * conversion rate and minimum detectable effect, and allows a winner to be
 * declared only when both arms have reached it.
 *
 * @param {object} experiment - Experiment state.
 * @param {number} experiment.baselineConversionRate - Baseline conversion rate.
 * @param {number} experiment.minimumDetectableEffect - Relative effect to detect.
 * @param {number} experiment.controlTotal - Users observed in the control arm.
 * @param {number} experiment.treatmentTotal - Users observed in the treatment arm.
 * @returns {boolean} True when both arms meet the minimum sample size.
 */
function canDeclareWinner(experiment) {
  const perArmMinimum = minimumSampleSize(
    experiment.baselineConversionRate,
    experiment.minimumDetectableEffect
  );

  // The control arm is checked first; the treatment arm only matters once
  // control has enough users.
  if (!(experiment.controlTotal >= perArmMinimum)) {
    return false;
  }
  return experiment.treatmentTotal >= perArmMinimum;
}
module.exports = { minimumSampleSize, canDeclareWinner };

// test/experiment-guard.test.js
const { minimumSampleSize, canDeclareWinner } = require('../src/experiment-guard');
// Jest suite for minimumSampleSize: checks how the required sample size moves
// with effect size and baseline rate, and that the result is a positive integer.
describe('minimumSampleSize', () => {
  it('requires larger sample for smaller effects', () => {
    const baseline = 0.05;
    const forTwentyPercentLift = minimumSampleSize(baseline, 0.20); // 20% relative improvement
    const forFivePercentLift = minimumSampleSize(baseline, 0.05); // 5% relative improvement
    expect(forFivePercentLift).toBeGreaterThan(forTwentyPercentLift);
  });

  it('requires smaller sample for higher baseline rate', () => {
    const relativeLift = 0.20;
    const atOnePercentBaseline = minimumSampleSize(0.01, relativeLift);
    const atThirtyPercentBaseline = minimumSampleSize(0.30, relativeLift);
    expect(atThirtyPercentBaseline).toBeLessThan(atOnePercentBaseline);
  });

  it('returns positive integer', () => {
    const sampleSize = minimumSampleSize(0.05, 0.10);
    expect(Number.isInteger(sampleSize)).toBe(true);
    expect(sampleSize).toBeGreaterThan(0);
  });
});
describe('canDeclareWinner', () => {
it('blocks early stopping', () => {
const experiment = {
baselineConversionRate: 0.05,
minimumDetectableEffect: 0.10,
controlTotal: 100, // way below minimum
treatmentTotal: 100,
};
expect(canDeclareWinner(experiment)).toBe(false);
});
it('allows declaration after sufficient sample', () => {
const required = minimumSampleSize(0.05, 0.10);
const experiment = {
baselineConversionRate: 0.05,
minimumDetectableEffect: 0.10,
controlTotal: required + 100,
treatmentTotal: required + 100,
};
expect(canDeclareWinner(experiment)).toBe(true);
});
});Integration With HelpMeTest
Running A/B tests requires verifying that the right variant is shown to the right users in a real browser—not just in unit tests. HelpMeTest lets you write browser tests that assert on experiment assignment behavior across sessions, confirm that variant UIs render correctly, and verify that conversion events are tracked for each bucket.
Summary
- Test variant assignment for determinism (same user always gets same variant)
- Test distribution uniformity with large samples (1000+ users to detect skew)
- Test monotonicity of percentage rollouts (users included at 10% stay included at 20%)
- Test your statistical significance calculation against known expected outputs
- Test minimum sample size enforcement to prevent early stopping errors
- These tests protect the validity of your experimental results, not just the features under test