""" A/B testing module for App Store Optimization. Plans and tracks A/B tests for metadata and visual assets. """ from typing import Dict, List, Any, Optional import math class ABTestPlanner: """Plans and tracks A/B tests for ASO elements.""" # Minimum detectable effect sizes (conservative estimates) MIN_EFFECT_SIZES = { 'icon': 0.10, # 10% conversion improvement 'screenshot': 0.08, # 8% conversion improvement 'title': 0.05, # 5% conversion improvement 'description': 0.03 # 3% conversion improvement } # Statistical confidence levels CONFIDENCE_LEVELS = { 'high': 0.95, # 95% confidence 'standard': 0.90, # 90% confidence 'exploratory': 0.80 # 80% confidence } def __init__(self): """Initialize A/B test planner.""" self.active_tests = [] def design_test( self, test_type: str, variant_a: Dict[str, Any], variant_b: Dict[str, Any], hypothesis: str, success_metric: str = 'conversion_rate' ) -> Dict[str, Any]: """ Design an A/B test with hypothesis and variables. Args: test_type: Type of test ('icon', 'screenshot', 'title', 'description') variant_a: Control variant details variant_b: Test variant details hypothesis: Expected outcome hypothesis success_metric: Metric to optimize Returns: Test design with configuration """ test_design = { 'test_id': self._generate_test_id(test_type), 'test_type': test_type, 'hypothesis': hypothesis, 'variants': { 'a': { 'name': 'Control', 'details': variant_a, 'traffic_split': 0.5 }, 'b': { 'name': 'Variation', 'details': variant_b, 'traffic_split': 0.5 } }, 'success_metric': success_metric, 'secondary_metrics': self._get_secondary_metrics(test_type), 'minimum_effect_size': self.MIN_EFFECT_SIZES.get(test_type, 0.05), 'recommended_confidence': 'standard', 'best_practices': self._get_test_best_practices(test_type) } self.active_tests.append(test_design) return test_design def calculate_sample_size( self, baseline_conversion: float, minimum_detectable_effect: float, confidence_level: str = 'standard', power: float = 0.80 ) -> Dict[str, Any]: """ Calculate required sample size for statistical significance. Args: baseline_conversion: Current conversion rate (0-1) minimum_detectable_effect: Minimum effect size to detect (0-1) confidence_level: 'high', 'standard', or 'exploratory' power: Statistical power (typically 0.80 or 0.90) Returns: Sample size calculation with duration estimates """ alpha = 1 - self.CONFIDENCE_LEVELS[confidence_level] beta = 1 - power # Expected conversion for variant B expected_conversion_b = baseline_conversion * (1 + minimum_detectable_effect) # Z-scores for alpha and beta z_alpha = self._get_z_score(1 - alpha / 2) # Two-tailed test z_beta = self._get_z_score(power) # Pooled standard deviation p_pooled = (baseline_conversion + expected_conversion_b) / 2 sd_pooled = math.sqrt(2 * p_pooled * (1 - p_pooled)) # Sample size per variant n_per_variant = math.ceil( ((z_alpha + z_beta) ** 2 * sd_pooled ** 2) / ((expected_conversion_b - baseline_conversion) ** 2) ) total_sample_size = n_per_variant * 2 # Estimate duration based on typical traffic duration_estimates = self._estimate_test_duration( total_sample_size, baseline_conversion ) return { 'sample_size_per_variant': n_per_variant, 'total_sample_size': total_sample_size, 'baseline_conversion': baseline_conversion, 'expected_conversion_improvement': minimum_detectable_effect, 'expected_conversion_b': expected_conversion_b, 'confidence_level': confidence_level, 'statistical_power': power, 'duration_estimates': duration_estimates, 'recommendations': self._generate_sample_size_recommendations( n_per_variant, duration_estimates ) } def calculate_significance( self, variant_a_conversions: int, variant_a_visitors: int, variant_b_conversions: int, variant_b_visitors: int ) -> Dict[str, Any]: """ Calculate statistical significance of test results. Args: variant_a_conversions: Conversions for control variant_a_visitors: Visitors for control variant_b_conversions: Conversions for variation variant_b_visitors: Visitors for variation Returns: Significance analysis with decision recommendation """ # Calculate conversion rates rate_a = variant_a_conversions / variant_a_visitors if variant_a_visitors > 0 else 0 rate_b = variant_b_conversions / variant_b_visitors if variant_b_visitors > 0 else 0 # Calculate improvement if rate_a > 0: relative_improvement = (rate_b - rate_a) / rate_a else: relative_improvement = 0 absolute_improvement = rate_b - rate_a # Calculate standard error se_a = math.sqrt(rate_a * (1 - rate_a) / variant_a_visitors) if variant_a_visitors > 0 else 0 se_b = math.sqrt(rate_b * (1 - rate_b) / variant_b_visitors) if variant_b_visitors > 0 else 0 se_diff = math.sqrt(se_a**2 + se_b**2) # Calculate z-score z_score = absolute_improvement / se_diff if se_diff > 0 else 0 # Calculate p-value (two-tailed) p_value = 2 * (1 - self._standard_normal_cdf(abs(z_score))) # Determine significance is_significant_95 = p_value < 0.05 is_significant_90 = p_value < 0.10 # Generate decision decision = self._generate_test_decision( relative_improvement, is_significant_95, is_significant_90, variant_a_visitors + variant_b_visitors ) return { 'variant_a': { 'conversions': variant_a_conversions, 'visitors': variant_a_visitors, 'conversion_rate': round(rate_a, 4) }, 'variant_b': { 'conversions': variant_b_conversions, 'visitors': variant_b_visitors, 'conversion_rate': round(rate_b, 4) }, 'improvement': { 'absolute': round(absolute_improvement, 4), 'relative_percentage': round(relative_improvement * 100, 2) }, 'statistical_analysis': { 'z_score': round(z_score, 3), 'p_value': round(p_value, 4), 'is_significant_95': is_significant_95, 'is_significant_90': is_significant_90, 'confidence_level': '95%' if is_significant_95 else ('90%' if is_significant_90 else 'Not significant') }, 'decision': decision } def track_test_results( self, test_id: str, results_data: Dict[str, Any] ) -> Dict[str, Any]: """ Track ongoing test results and provide recommendations. Args: test_id: Test identifier results_data: Current test results Returns: Test tracking report with next steps """ # Find test test = next((t for t in self.active_tests if t['test_id'] == test_id), None) if not test: return {'error': f'Test {test_id} not found'} # Calculate significance significance = self.calculate_significance( results_data['variant_a_conversions'], results_data['variant_a_visitors'], results_data['variant_b_conversions'], results_data['variant_b_visitors'] ) # Calculate test progress total_visitors = results_data['variant_a_visitors'] + results_data['variant_b_visitors'] required_sample = results_data.get('required_sample_size', 10000) progress_percentage = min((total_visitors / required_sample) * 100, 100) # Generate recommendations recommendations = self._generate_tracking_recommendations( significance, progress_percentage, test['test_type'] ) return { 'test_id': test_id, 'test_type': test['test_type'], 'progress': { 'total_visitors': total_visitors, 'required_sample_size': required_sample, 'progress_percentage': round(progress_percentage, 1), 'is_complete': progress_percentage >= 100 }, 'current_results': significance, 'recommendations': recommendations, 'next_steps': self._determine_next_steps( significance, progress_percentage ) } def generate_test_report( self, test_id: str, final_results: Dict[str, Any] ) -> Dict[str, Any]: """ Generate final test report with insights and recommendations. Args: test_id: Test identifier final_results: Final test results Returns: Comprehensive test report """ test = next((t for t in self.active_tests if t['test_id'] == test_id), None) if not test: return {'error': f'Test {test_id} not found'} significance = self.calculate_significance( final_results['variant_a_conversions'], final_results['variant_a_visitors'], final_results['variant_b_conversions'], final_results['variant_b_visitors'] ) # Generate insights insights = self._generate_test_insights( test, significance, final_results ) # Implementation plan implementation_plan = self._create_implementation_plan( test, significance ) return { 'test_summary': { 'test_id': test_id, 'test_type': test['test_type'], 'hypothesis': test['hypothesis'], 'duration_days': final_results.get('duration_days', 'N/A') }, 'results': significance, 'insights': insights, 'implementation_plan': implementation_plan, 'learnings': self._extract_learnings(test, significance) } def _generate_test_id(self, test_type: str) -> str: """Generate unique test ID.""" import time timestamp = int(time.time()) return f"{test_type}_{timestamp}" def _get_secondary_metrics(self, test_type: str) -> List[str]: """Get secondary metrics to track for test type.""" metrics_map = { 'icon': ['tap_through_rate', 'impression_count', 'brand_recall'], 'screenshot': ['tap_through_rate', 'time_on_page', 'scroll_depth'], 'title': ['impression_count', 'tap_through_rate', 'search_visibility'], 'description': ['time_on_page', 'scroll_depth', 'tap_through_rate'] } return metrics_map.get(test_type, ['tap_through_rate']) def _get_test_best_practices(self, test_type: str) -> List[str]: """Get best practices for specific test type.""" practices_map = { 'icon': [ 'Test only one element at a time (color vs. style vs. symbolism)', 'Ensure icon is recognizable at small sizes (60x60px)', 'Consider cultural context for global audience', 'Test against top competitor icons' ], 'screenshot': [ 'Test order of screenshots (users see first 2-3)', 'Use captions to tell story', 'Show key features and benefits', 'Test with and without device frames' ], 'title': [ 'Test keyword variations, not major rebrand', 'Keep brand name consistent', 'Ensure title fits within character limits', 'Test on both search and browse contexts' ], 'description': [ 'Test structure (bullet points vs. paragraphs)', 'Test call-to-action placement', 'Test feature vs. benefit focus', 'Maintain keyword density' ] } return practices_map.get(test_type, ['Test one variable at a time']) def _estimate_test_duration( self, required_sample_size: int, baseline_conversion: float ) -> Dict[str, Any]: """Estimate test duration based on typical traffic levels.""" # Assume different daily traffic scenarios traffic_scenarios = { 'low': 100, # 100 page views/day 'medium': 1000, # 1000 page views/day 'high': 10000 # 10000 page views/day } estimates = {} for scenario, daily_views in traffic_scenarios.items(): days = math.ceil(required_sample_size / daily_views) estimates[scenario] = { 'daily_page_views': daily_views, 'estimated_days': days, 'estimated_weeks': round(days / 7, 1) } return estimates def _generate_sample_size_recommendations( self, sample_size: int, duration_estimates: Dict[str, Any] ) -> List[str]: """Generate recommendations based on sample size.""" recommendations = [] if sample_size > 50000: recommendations.append( "Large sample size required - consider testing smaller effect size or increasing traffic" ) if duration_estimates['medium']['estimated_days'] > 30: recommendations.append( "Long test duration - consider higher minimum detectable effect or focus on high-impact changes" ) if duration_estimates['low']['estimated_days'] > 60: recommendations.append( "Insufficient traffic for reliable testing - consider user acquisition or broader targeting" ) if not recommendations: recommendations.append("Sample size and duration are reasonable for this test") return recommendations def _get_z_score(self, percentile: float) -> float: """Get z-score for given percentile (approximation).""" # Common z-scores z_scores = { 0.80: 0.84, 0.85: 1.04, 0.90: 1.28, 0.95: 1.645, 0.975: 1.96, 0.99: 2.33 } return z_scores.get(percentile, 1.96) def _standard_normal_cdf(self, z: float) -> float: """Approximate standard normal cumulative distribution function.""" # Using error function approximation t = 1.0 / (1.0 + 0.2316419 * abs(z)) d = 0.3989423 * math.exp(-z * z / 2.0) p = d * t * (0.3193815 + t * (-0.3565638 + t * (1.781478 + t * (-1.821256 + t * 1.330274)))) if z > 0: return 1.0 - p else: return p def _generate_test_decision( self, improvement: float, is_significant_95: bool, is_significant_90: bool, total_visitors: int ) -> Dict[str, Any]: """Generate test decision and recommendation.""" if total_visitors < 1000: return { 'decision': 'continue', 'rationale': 'Insufficient data - continue test to reach minimum sample size', 'action': 'Keep test running' } if is_significant_95: if improvement > 0: return { 'decision': 'implement_b', 'rationale': f'Variant B shows {improvement*100:.1f}% improvement with 95% confidence', 'action': 'Implement Variant B' } else: return { 'decision': 'keep_a', 'rationale': 'Variant A performs better with 95% confidence', 'action': 'Keep current version (A)' } elif is_significant_90: if improvement > 0: return { 'decision': 'implement_b_cautiously', 'rationale': f'Variant B shows {improvement*100:.1f}% improvement with 90% confidence', 'action': 'Consider implementing B, monitor closely' } else: return { 'decision': 'keep_a', 'rationale': 'Variant A performs better with 90% confidence', 'action': 'Keep current version (A)' } else: return { 'decision': 'inconclusive', 'rationale': 'No statistically significant difference detected', 'action': 'Either keep A or test different hypothesis' } def _generate_tracking_recommendations( self, significance: Dict[str, Any], progress: float, test_type: str ) -> List[str]: """Generate recommendations for ongoing test.""" recommendations = [] if progress < 50: recommendations.append( f"Test is {progress:.0f}% complete - continue collecting data" ) if progress >= 100: if significance['statistical_analysis']['is_significant_95']: recommendations.append( "Sufficient data collected with significant results - ready to conclude test" ) else: recommendations.append( "Sample size reached but no significant difference - consider extending test or concluding" ) return recommendations def _determine_next_steps( self, significance: Dict[str, Any], progress: float ) -> str: """Determine next steps for test.""" if progress < 100: return f"Continue test until reaching 100% sample size (currently {progress:.0f}%)" decision = significance.get('decision', {}).get('decision', 'inconclusive') if decision == 'implement_b': return "Implement Variant B and monitor metrics for 2 weeks" elif decision == 'keep_a': return "Keep Variant A and design new test with different hypothesis" else: return "Test inconclusive - either keep A or design new test" def _generate_test_insights( self, test: Dict[str, Any], significance: Dict[str, Any], results: Dict[str, Any] ) -> List[str]: """Generate insights from test results.""" insights = [] improvement = significance['improvement']['relative_percentage'] if significance['statistical_analysis']['is_significant_95']: insights.append( f"Strong evidence: Variant B {'improved' if improvement > 0 else 'decreased'} " f"conversion by {abs(improvement):.1f}% with 95% confidence" ) insights.append( f"Tested {test['test_type']} changes: {test['hypothesis']}" ) # Add context-specific insights if test['test_type'] == 'icon' and improvement > 5: insights.append( "Icon change had substantial impact - visual first impression is critical" ) return insights def _create_implementation_plan( self, test: Dict[str, Any], significance: Dict[str, Any] ) -> List[Dict[str, str]]: """Create implementation plan for winning variant.""" plan = [] if significance.get('decision', {}).get('decision') == 'implement_b': plan.append({ 'step': '1. Update store listing', 'details': f"Replace {test['test_type']} with Variant B across all platforms" }) plan.append({ 'step': '2. Monitor metrics', 'details': 'Track conversion rate for 2 weeks to confirm sustained improvement' }) plan.append({ 'step': '3. Document learnings', 'details': 'Record insights for future optimization' }) return plan def _extract_learnings( self, test: Dict[str, Any], significance: Dict[str, Any] ) -> List[str]: """Extract key learnings from test.""" learnings = [] improvement = significance['improvement']['relative_percentage'] learnings.append( f"Testing {test['test_type']} can yield {abs(improvement):.1f}% conversion change" ) if test['test_type'] == 'title': learnings.append( "Title changes affect search visibility and user perception" ) elif test['test_type'] == 'screenshot': learnings.append( "First 2-3 screenshots are critical for conversion" ) return learnings def plan_ab_test( test_type: str, variant_a: Dict[str, Any], variant_b: Dict[str, Any], hypothesis: str, baseline_conversion: float ) -> Dict[str, Any]: """ Convenience function to plan an A/B test. Args: test_type: Type of test variant_a: Control variant variant_b: Test variant hypothesis: Test hypothesis baseline_conversion: Current conversion rate Returns: Complete test plan """ planner = ABTestPlanner() test_design = planner.design_test( test_type, variant_a, variant_b, hypothesis ) sample_size = planner.calculate_sample_size( baseline_conversion, planner.MIN_EFFECT_SIZES.get(test_type, 0.05) ) return { 'test_design': test_design, 'sample_size_requirements': sample_size }