#!/usr/bin/env python3 """ ETL Performance Optimizer Comprehensive ETL/ELT performance analysis and optimization tool. Features: - SQL query analysis and optimization recommendations - Spark job configuration analysis - Data skew detection and mitigation - Partition strategy recommendations - Join optimization suggestions - Memory and shuffle analysis - Cost estimation for cloud warehouses Usage: python etl_performance_optimizer.py analyze-sql query.sql python etl_performance_optimizer.py analyze-spark spark-history.json python etl_performance_optimizer.py optimize-partition data_stats.json python etl_performance_optimizer.py estimate-cost query.sql --warehouse snowflake """ import os import sys import json import re import argparse import logging import math from pathlib import Path from typing import Dict, List, Optional, Any, Tuple, Set from dataclasses import dataclass, field, asdict from datetime import datetime from collections import defaultdict from abc import ABC, abstractmethod logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # ============================================================================= # Data Classes # ============================================================================= @dataclass class SQLQueryInfo: """Parsed information about a SQL query""" query_type: str # SELECT, INSERT, UPDATE, DELETE, MERGE, CREATE tables: List[str] columns: List[str] joins: List[Dict[str, str]] where_conditions: List[str] group_by: List[str] order_by: List[str] aggregations: List[str] subqueries: int distinct: bool limit: Optional[int] ctes: List[str] window_functions: List[str] estimated_complexity: str # low, medium, high, very_high @dataclass class OptimizationRecommendation: """A single optimization recommendation""" category: str # index, partition, join, filter, aggregation, memory, shuffle severity: str # critical, high, medium, low title: str description: str current_issue: str recommendation: str expected_improvement: str implementation: str priority: int = 1 @dataclass class SparkJobMetrics: """Metrics from a Spark job""" job_id: str duration_ms: int stages: int tasks: int shuffle_read_bytes: int shuffle_write_bytes: int input_bytes: int output_bytes: int peak_memory_bytes: int gc_time_ms: int failed_tasks: int speculative_tasks: int skew_ratio: float # max_task_time / median_task_time @dataclass class PartitionStrategy: """Recommended partition strategy""" column: str partition_type: str # range, hash, list num_partitions: Optional[int] partition_size_mb: float reasoning: str implementation: str @dataclass class CostEstimate: """Cost estimate for a query""" warehouse: str compute_cost: float storage_cost: float data_transfer_cost: float total_cost: float currency: str = "USD" assumptions: List[str] = field(default_factory=list) # ============================================================================= # SQL Parser # ============================================================================= class SQLParser: """Parse and analyze SQL queries""" # Common SQL patterns PATTERNS = { 'select': re.compile(r'\bSELECT\b', re.IGNORECASE), 'from': re.compile(r'\bFROM\b', re.IGNORECASE), 'join': re.compile(r'\b(INNER|LEFT|RIGHT|FULL|CROSS)?\s*JOIN\b', re.IGNORECASE), 'where': re.compile(r'\bWHERE\b', re.IGNORECASE), 'group_by': re.compile(r'\bGROUP\s+BY\b', re.IGNORECASE), 'order_by': re.compile(r'\bORDER\s+BY\b', re.IGNORECASE), 'having': re.compile(r'\bHAVING\b', re.IGNORECASE), 'distinct': re.compile(r'\bDISTINCT\b', re.IGNORECASE), 'limit': re.compile(r'\bLIMIT\s+(\d+)', re.IGNORECASE), 'cte': re.compile(r'\bWITH\b', re.IGNORECASE), 'subquery': re.compile(r'\(\s*SELECT\b', re.IGNORECASE), 'window': re.compile(r'\bOVER\s*\(', re.IGNORECASE), 'aggregation': re.compile(r'\b(COUNT|SUM|AVG|MIN|MAX|STDDEV|VARIANCE)\s*\(', re.IGNORECASE), 'insert': re.compile(r'\bINSERT\s+INTO\b', re.IGNORECASE), 'update': re.compile(r'\bUPDATE\b', re.IGNORECASE), 'delete': re.compile(r'\bDELETE\s+FROM\b', re.IGNORECASE), 'merge': re.compile(r'\bMERGE\s+INTO\b', re.IGNORECASE), 'create': re.compile(r'\bCREATE\s+(TABLE|VIEW|INDEX)\b', re.IGNORECASE), } def parse(self, sql: str) -> SQLQueryInfo: """Parse a SQL query and extract information""" # Clean up the query sql = self._clean_sql(sql) # Determine query type query_type = self._detect_query_type(sql) # Extract tables tables = self._extract_tables(sql) # Extract columns (for SELECT queries) columns = self._extract_columns(sql) if query_type == 'SELECT' else [] # Extract joins joins = self._extract_joins(sql) # Extract WHERE conditions where_conditions = self._extract_where_conditions(sql) # Extract GROUP BY group_by = self._extract_group_by(sql) # Extract ORDER BY order_by = self._extract_order_by(sql) # Extract aggregations aggregations = self._extract_aggregations(sql) # Count subqueries subqueries = len(self.PATTERNS['subquery'].findall(sql)) # Check for DISTINCT distinct = bool(self.PATTERNS['distinct'].search(sql)) # Extract LIMIT limit_match = self.PATTERNS['limit'].search(sql) limit = int(limit_match.group(1)) if limit_match else None # Extract CTEs ctes = self._extract_ctes(sql) # Extract window functions window_functions = self._extract_window_functions(sql) # Estimate complexity complexity = self._estimate_complexity( tables, joins, subqueries, aggregations, window_functions ) return SQLQueryInfo( query_type=query_type, tables=tables, columns=columns, joins=joins, where_conditions=where_conditions, group_by=group_by, order_by=order_by, aggregations=aggregations, subqueries=subqueries, distinct=distinct, limit=limit, ctes=ctes, window_functions=window_functions, estimated_complexity=complexity ) def _clean_sql(self, sql: str) -> str: """Clean and normalize SQL""" # Remove comments sql = re.sub(r'--.*$', '', sql, flags=re.MULTILINE) sql = re.sub(r'/\*.*?\*/', '', sql, flags=re.DOTALL) # Normalize whitespace sql = ' '.join(sql.split()) return sql def _detect_query_type(self, sql: str) -> str: """Detect the type of SQL query""" sql_upper = sql.upper().strip() if sql_upper.startswith('WITH') or sql_upper.startswith('SELECT'): return 'SELECT' elif self.PATTERNS['insert'].search(sql): return 'INSERT' elif self.PATTERNS['update'].search(sql): return 'UPDATE' elif self.PATTERNS['delete'].search(sql): return 'DELETE' elif self.PATTERNS['merge'].search(sql): return 'MERGE' elif self.PATTERNS['create'].search(sql): return 'CREATE' else: return 'UNKNOWN' def _extract_tables(self, sql: str) -> List[str]: """Extract table names from SQL""" tables = [] # FROM clause tables from_pattern = re.compile( r'\bFROM\s+([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)?)', re.IGNORECASE ) tables.extend(from_pattern.findall(sql)) # JOIN clause tables join_pattern = re.compile( r'\bJOIN\s+([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)?)', re.IGNORECASE ) tables.extend(join_pattern.findall(sql)) # INSERT INTO table insert_pattern = re.compile( r'\bINSERT\s+INTO\s+([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)?)', re.IGNORECASE ) tables.extend(insert_pattern.findall(sql)) # UPDATE table update_pattern = re.compile( r'\bUPDATE\s+([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)?)', re.IGNORECASE ) tables.extend(update_pattern.findall(sql)) return list(set(tables)) def _extract_columns(self, sql: str) -> List[str]: """Extract column references from SELECT clause""" # Find SELECT ... FROM match = re.search(r'\bSELECT\s+(.*?)\s+FROM\b', sql, re.IGNORECASE | re.DOTALL) if not match: return [] select_clause = match.group(1) # Handle SELECT * if '*' in select_clause and 'COUNT(*)' not in select_clause.upper(): return ['*'] # Extract column names (simplified) columns = [] for part in select_clause.split(','): part = part.strip() # Handle aliases alias_match = re.search(r'\bAS\s+(\w+)\s*$', part, re.IGNORECASE) if alias_match: columns.append(alias_match.group(1)) else: # Get the last identifier col_match = re.search(r'([a-zA-Z_][a-zA-Z0-9_]*)(?:\s*$|\s+AS\b)', part, re.IGNORECASE) if col_match: columns.append(col_match.group(1)) return columns def _extract_joins(self, sql: str) -> List[Dict[str, str]]: """Extract join information""" joins = [] join_pattern = re.compile( r'\b(INNER|LEFT\s+OUTER?|RIGHT\s+OUTER?|FULL\s+OUTER?|CROSS)?\s*JOIN\s+' r'([a-zA-Z_][a-zA-Z0-9_.]*)\s*(?:AS\s+)?(\w+)?\s*' r'(?:ON\s+(.+?))?(?=\s+(?:INNER|LEFT|RIGHT|FULL|CROSS|WHERE|GROUP|ORDER|HAVING|LIMIT|$))', re.IGNORECASE | re.DOTALL ) for match in join_pattern.finditer(sql): join_type = match.group(1) or 'INNER' table = match.group(2) alias = match.group(3) condition = match.group(4) joins.append({ 'type': join_type.strip().upper(), 'table': table, 'alias': alias, 'condition': condition.strip() if condition else None }) return joins def _extract_where_conditions(self, sql: str) -> List[str]: """Extract WHERE clause conditions""" # Find WHERE ... (GROUP BY | ORDER BY | HAVING | LIMIT | end) match = re.search( r'\bWHERE\s+(.*?)(?=\s+(?:GROUP\s+BY|ORDER\s+BY|HAVING|LIMIT)|$)', sql, re.IGNORECASE | re.DOTALL ) if not match: return [] where_clause = match.group(1).strip() # Split by AND/OR (simplified) conditions = re.split(r'\s+AND\s+|\s+OR\s+', where_clause, flags=re.IGNORECASE) return [c.strip() for c in conditions if c.strip()] def _extract_group_by(self, sql: str) -> List[str]: """Extract GROUP BY columns""" match = re.search( r'\bGROUP\s+BY\s+(.*?)(?=\s+(?:HAVING|ORDER\s+BY|LIMIT)|$)', sql, re.IGNORECASE | re.DOTALL ) if not match: return [] group_clause = match.group(1).strip() columns = [c.strip() for c in group_clause.split(',')] return columns def _extract_order_by(self, sql: str) -> List[str]: """Extract ORDER BY columns""" match = re.search( r'\bORDER\s+BY\s+(.*?)(?=\s+LIMIT|$)', sql, re.IGNORECASE | re.DOTALL ) if not match: return [] order_clause = match.group(1).strip() columns = [c.strip() for c in order_clause.split(',')] return columns def _extract_aggregations(self, sql: str) -> List[str]: """Extract aggregation functions used""" agg_pattern = re.compile( r'\b(COUNT|SUM|AVG|MIN|MAX|STDDEV|VARIANCE|MEDIAN|PERCENTILE_CONT|PERCENTILE_DISC)\s*\(', re.IGNORECASE ) return list(set(m.upper() for m in agg_pattern.findall(sql))) def _extract_ctes(self, sql: str) -> List[str]: """Extract CTE names""" cte_pattern = re.compile( r'\bWITH\s+(\w+)\s+AS\s*\(|,\s*(\w+)\s+AS\s*\(', re.IGNORECASE ) ctes = [] for match in cte_pattern.finditer(sql): cte_name = match.group(1) or match.group(2) if cte_name: ctes.append(cte_name) return ctes def _extract_window_functions(self, sql: str) -> List[str]: """Extract window function patterns""" window_pattern = re.compile( r'\b(\w+)\s*\([^)]*\)\s+OVER\s*\(', re.IGNORECASE ) return list(set(m.upper() for m in window_pattern.findall(sql))) def _estimate_complexity(self, tables: List[str], joins: List[Dict], subqueries: int, aggregations: List[str], window_functions: List[str]) -> str: """Estimate query complexity""" score = 0 # Table count score += len(tables) * 10 # Join count and types for join in joins: if join['type'] in ('CROSS', 'FULL OUTER'): score += 30 elif join['type'] in ('LEFT OUTER', 'RIGHT OUTER'): score += 20 else: score += 15 # Subqueries score += subqueries * 25 # Aggregations score += len(aggregations) * 5 # Window functions score += len(window_functions) * 15 if score < 30: return 'low' elif score < 60: return 'medium' elif score < 100: return 'high' else: return 'very_high' # ============================================================================= # SQL Optimizer # ============================================================================= class SQLOptimizer: """Analyze SQL queries and provide optimization recommendations""" def analyze(self, query_info: SQLQueryInfo, sql: str) -> List[OptimizationRecommendation]: """Analyze a SQL query and generate optimization recommendations""" recommendations = [] # Check for SELECT * if '*' in query_info.columns: recommendations.append(self._recommend_explicit_columns()) # Check for missing WHERE clause on large tables if not query_info.where_conditions and query_info.tables: recommendations.append(self._recommend_add_filters()) # Check for inefficient joins join_recs = self._analyze_joins(query_info) recommendations.extend(join_recs) # Check for DISTINCT usage if query_info.distinct: recommendations.append(self._recommend_distinct_alternative()) # Check for ORDER BY without LIMIT if query_info.order_by and not query_info.limit: recommendations.append(self._recommend_add_limit()) # Check for subquery optimization if query_info.subqueries > 0: recommendations.append(self._recommend_cte_conversion()) # Check for index opportunities index_recs = self._analyze_index_opportunities(query_info) recommendations.extend(index_recs) # Check for partition pruning partition_recs = self._analyze_partition_pruning(query_info, sql) recommendations.extend(partition_recs) # Check for aggregation optimization if query_info.aggregations and query_info.group_by: agg_recs = self._analyze_aggregation(query_info) recommendations.extend(agg_recs) # Sort by priority recommendations.sort(key=lambda r: r.priority) return recommendations def _recommend_explicit_columns(self) -> OptimizationRecommendation: return OptimizationRecommendation( category="query_structure", severity="medium", title="Avoid SELECT *", description="Using SELECT * retrieves all columns, increasing I/O and memory usage.", current_issue="Query uses SELECT * which fetches unnecessary columns", recommendation="Specify only the columns you need", expected_improvement="10-50% reduction in data scanned depending on table width", implementation="Replace SELECT * with SELECT col1, col2, col3 ...", priority=2 ) def _recommend_add_filters(self) -> OptimizationRecommendation: return OptimizationRecommendation( category="filter", severity="high", title="Add WHERE Clause Filters", description="Query scans entire tables without filtering, causing full table scans.", current_issue="No WHERE clause filters found - full table scan required", recommendation="Add appropriate WHERE conditions to filter data early", expected_improvement="Up to 90%+ reduction in data processed if highly selective", implementation="Add WHERE column = value or WHERE date_column >= '2024-01-01'", priority=1 ) def _analyze_joins(self, query_info: SQLQueryInfo) -> List[OptimizationRecommendation]: """Analyze joins for optimization opportunities""" recommendations = [] for join in query_info.joins: # Check for CROSS JOIN if join['type'] == 'CROSS': recommendations.append(OptimizationRecommendation( category="join", severity="critical", title="Avoid CROSS JOIN", description="CROSS JOIN creates a Cartesian product, which can explode data volume.", current_issue=f"CROSS JOIN with table {join['table']} detected", recommendation="Replace with appropriate INNER/LEFT JOIN with ON condition", expected_improvement="Exponential reduction in intermediate data", implementation=f"Convert CROSS JOIN {join['table']} to INNER JOIN {join['table']} ON ...", priority=1 )) # Check for missing join condition if not join.get('condition'): recommendations.append(OptimizationRecommendation( category="join", severity="high", title="Missing Join Condition", description="Join without explicit ON condition may cause Cartesian product.", current_issue=f"JOIN with {join['table']} has no explicit ON condition", recommendation="Add explicit ON condition to the join", expected_improvement="Prevents accidental Cartesian products", implementation=f"Add ON {join['table']}.id = other_table.foreign_key", priority=1 )) # Check for many joins if len(query_info.joins) > 5: recommendations.append(OptimizationRecommendation( category="join", severity="medium", title="High Number of Joins", description="Many joins can lead to complex execution plans and performance issues.", current_issue=f"{len(query_info.joins)} joins detected in single query", recommendation="Consider breaking into smaller queries or pre-aggregating", expected_improvement="Better plan optimization and memory usage", implementation="Use CTEs to materialize intermediate results, or denormalize frequently joined data", priority=3 )) return recommendations def _recommend_distinct_alternative(self) -> OptimizationRecommendation: return OptimizationRecommendation( category="query_structure", severity="medium", title="Consider Alternatives to DISTINCT", description="DISTINCT requires sorting/hashing all rows which can be expensive.", current_issue="DISTINCT used - may indicate data quality or join issues", recommendation="Review if DISTINCT is necessary or if joins produce duplicates", expected_improvement="Eliminates expensive deduplication step if not needed", implementation="Review join conditions, or use GROUP BY if aggregating anyway", priority=3 ) def _recommend_add_limit(self) -> OptimizationRecommendation: return OptimizationRecommendation( category="query_structure", severity="low", title="Add LIMIT to ORDER BY", description="ORDER BY without LIMIT sorts entire result set unnecessarily.", current_issue="ORDER BY present without LIMIT clause", recommendation="Add LIMIT if only top N rows are needed", expected_improvement="Significant reduction in sorting overhead for large results", implementation="Add LIMIT 100 (or appropriate number) after ORDER BY", priority=4 ) def _recommend_cte_conversion(self) -> OptimizationRecommendation: return OptimizationRecommendation( category="query_structure", severity="medium", title="Convert Subqueries to CTEs", description="Subqueries can be harder to optimize and maintain than CTEs.", current_issue="Subqueries detected in the query", recommendation="Convert correlated subqueries to CTEs or JOINs", expected_improvement="Better query plan optimization and readability", implementation="WITH subquery_name AS (SELECT ...) SELECT ... FROM main_table JOIN subquery_name", priority=3 ) def _analyze_index_opportunities(self, query_info: SQLQueryInfo) -> List[OptimizationRecommendation]: """Identify potential index opportunities""" recommendations = [] # Columns in WHERE clause are index candidates where_columns = set() for condition in query_info.where_conditions: # Extract column names from conditions col_pattern = re.compile(r'\b([a-zA-Z_][a-zA-Z0-9_]*)\s*(?:=|>|<|>=|<=|<>|!=|LIKE|IN|BETWEEN)', re.IGNORECASE) where_columns.update(col_pattern.findall(condition)) if where_columns: recommendations.append(OptimizationRecommendation( category="index", severity="medium", title="Consider Indexes on Filter Columns", description="Columns used in WHERE clauses benefit from indexes.", current_issue=f"Filter columns detected: {', '.join(where_columns)}", recommendation="Create indexes on frequently filtered columns", expected_improvement="Orders of magnitude faster for selective queries", implementation=f"CREATE INDEX idx_name ON table ({', '.join(list(where_columns)[:3])})", priority=2 )) # JOIN columns are index candidates join_columns = set() for join in query_info.joins: if join.get('condition'): col_pattern = re.compile(r'\.([a-zA-Z_][a-zA-Z0-9_]*)\s*=', re.IGNORECASE) join_columns.update(col_pattern.findall(join['condition'])) if join_columns: recommendations.append(OptimizationRecommendation( category="index", severity="high", title="Index Join Columns", description="Join columns without indexes cause expensive full table scans.", current_issue=f"Join columns detected: {', '.join(join_columns)}", recommendation="Ensure indexes exist on join key columns", expected_improvement="Dramatic improvement in join performance", implementation=f"CREATE INDEX idx_join ON table ({list(join_columns)[0]})", priority=1 )) return recommendations def _analyze_partition_pruning(self, query_info: SQLQueryInfo, sql: str) -> List[OptimizationRecommendation]: """Check for partition pruning opportunities""" recommendations = [] # Look for date/time columns in WHERE clause date_pattern = re.compile( r'\b(date|time|timestamp|created|updated|modified)_?\w*\s*(?:=|>|<|>=|<=|BETWEEN)', re.IGNORECASE ) if date_pattern.search(sql): recommendations.append(OptimizationRecommendation( category="partition", severity="medium", title="Leverage Partition Pruning", description="Date-based filters can leverage partitioned tables for massive speedups.", current_issue="Date/time filter detected - ensure table is partitioned", recommendation="Partition table by date column and ensure filter format matches", expected_improvement="90%+ reduction in data scanned for time-bounded queries", implementation="CREATE TABLE ... PARTITION BY RANGE (date_column) or use dynamic partitioning", priority=2 )) return recommendations def _analyze_aggregation(self, query_info: SQLQueryInfo) -> List[OptimizationRecommendation]: """Analyze aggregation patterns""" recommendations = [] # High cardinality GROUP BY warning if len(query_info.group_by) > 3: recommendations.append(OptimizationRecommendation( category="aggregation", severity="medium", title="High Cardinality GROUP BY", description="Grouping by many columns increases memory usage and reduces aggregation benefit.", current_issue=f"GROUP BY with {len(query_info.group_by)} columns detected", recommendation="Review if all group by columns are necessary", expected_improvement="Reduced memory and faster aggregation", implementation="Remove non-essential GROUP BY columns or pre-aggregate", priority=3 )) # COUNT DISTINCT optimization if 'COUNT' in query_info.aggregations and query_info.distinct: recommendations.append(OptimizationRecommendation( category="aggregation", severity="medium", title="Optimize COUNT DISTINCT", description="COUNT DISTINCT can be expensive for high cardinality columns.", current_issue="COUNT DISTINCT pattern detected", recommendation="Consider HyperLogLog approximation for very large datasets", expected_improvement="Massive speedup with ~2% error tolerance", implementation="Use APPROX_COUNT_DISTINCT() if available in your warehouse", priority=3 )) return recommendations # ============================================================================= # Spark Job Analyzer # ============================================================================= class SparkJobAnalyzer: """Analyze Spark job metrics and provide optimization recommendations""" def analyze(self, metrics: SparkJobMetrics) -> List[OptimizationRecommendation]: """Analyze Spark job metrics""" recommendations = [] # Check for data skew if metrics.skew_ratio > 5: recommendations.append(self._recommend_skew_mitigation(metrics)) # Check for excessive shuffle shuffle_ratio = metrics.shuffle_write_bytes / max(metrics.input_bytes, 1) if shuffle_ratio > 1.5: recommendations.append(self._recommend_reduce_shuffle(metrics, shuffle_ratio)) # Check for GC overhead gc_ratio = metrics.gc_time_ms / max(metrics.duration_ms, 1) if gc_ratio > 0.1: recommendations.append(self._recommend_memory_tuning(metrics, gc_ratio)) # Check for failed tasks if metrics.failed_tasks > 0: fail_ratio = metrics.failed_tasks / max(metrics.tasks, 1) recommendations.append(self._recommend_failure_handling(metrics, fail_ratio)) # Check for speculative execution overhead if metrics.speculative_tasks > metrics.tasks * 0.1: recommendations.append(self._recommend_reduce_speculation(metrics)) # Check task count if metrics.tasks > 10000: recommendations.append(self._recommend_reduce_tasks(metrics)) elif metrics.tasks < 10 and metrics.input_bytes > 1e9: recommendations.append(self._recommend_increase_parallelism(metrics)) return recommendations def _recommend_skew_mitigation(self, metrics: SparkJobMetrics) -> OptimizationRecommendation: return OptimizationRecommendation( category="skew", severity="critical", title="Severe Data Skew Detected", description=f"Skew ratio of {metrics.skew_ratio:.1f}x indicates uneven data distribution.", current_issue=f"Task execution time varies by {metrics.skew_ratio:.1f}x, causing stragglers", recommendation="Apply skew handling techniques to rebalance data", expected_improvement="Up to 80% reduction in job time by eliminating stragglers", implementation="""Options: 1. Salting: Add random prefix to skewed keys df.withColumn("salted_key", concat(col("key"), lit("_"), (rand() * 10).cast("int"))) 2. Broadcast join for small tables: df1.join(broadcast(df2), "key") 3. Adaptive Query Execution (Spark 3.0+): spark.conf.set("spark.sql.adaptive.enabled", "true") spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")""", priority=1 ) def _recommend_reduce_shuffle(self, metrics: SparkJobMetrics, ratio: float) -> OptimizationRecommendation: return OptimizationRecommendation( category="shuffle", severity="high", title="Excessive Shuffle Data", description=f"Shuffle writes {ratio:.1f}x the input data size.", current_issue=f"Shuffle write: {metrics.shuffle_write_bytes / 1e9:.2f} GB vs input: {metrics.input_bytes / 1e9:.2f} GB", recommendation="Reduce shuffle through partitioning and early aggregation", expected_improvement="Significant network I/O and storage reduction", implementation="""Options: 1. Pre-aggregate before shuffle: df.groupBy("key").agg(sum("value")).repartition("key") 2. Use map-side combining: df.reduceByKey((a, b) => a + b) 3. Optimize partition count: spark.conf.set("spark.sql.shuffle.partitions", optimal_count) 4. Use bucketing for repeated joins: df.write.bucketBy(200, "key").saveAsTable("bucketed_table")""", priority=1 ) def _recommend_memory_tuning(self, metrics: SparkJobMetrics, gc_ratio: float) -> OptimizationRecommendation: return OptimizationRecommendation( category="memory", severity="high", title="High GC Overhead", description=f"GC time is {gc_ratio * 100:.1f}% of total execution time.", current_issue=f"GC time: {metrics.gc_time_ms / 1000:.1f}s out of {metrics.duration_ms / 1000:.1f}s total", recommendation="Tune memory settings to reduce garbage collection", expected_improvement="20-50% faster execution with proper memory config", implementation="""Memory tuning options: 1. Increase executor memory: --executor-memory 8g 2. Adjust memory fractions: spark.memory.fraction=0.6 spark.memory.storageFraction=0.5 3. Use off-heap memory: spark.memory.offHeap.enabled=true spark.memory.offHeap.size=4g 4. Reduce cached data: df.unpersist() when no longer needed 5. Use Kryo serialization: spark.serializer=org.apache.spark.serializer.KryoSerializer""", priority=2 ) def _recommend_failure_handling(self, metrics: SparkJobMetrics, fail_ratio: float) -> OptimizationRecommendation: return OptimizationRecommendation( category="reliability", severity="high" if fail_ratio > 0.1 else "medium", title="Task Failures Detected", description=f"{metrics.failed_tasks} tasks failed ({fail_ratio * 100:.1f}% failure rate).", current_issue="Task failures increase job time and resource usage due to retries", recommendation="Investigate failure causes and add resilience", expected_improvement="Reduced retries and more predictable job times", implementation="""Failure handling options: 1. Check executor logs for OOM: spark.executor.memoryOverhead=2g 2. Handle data issues: df.filter(col("value").isNotNull()) 3. Increase task retries: spark.task.maxFailures=4 4. Add checkpointing for long jobs: df.checkpoint() 5. Check for network timeouts: spark.network.timeout=300s""", priority=1 ) def _recommend_reduce_speculation(self, metrics: SparkJobMetrics) -> OptimizationRecommendation: return OptimizationRecommendation( category="execution", severity="medium", title="High Speculative Execution", description=f"{metrics.speculative_tasks} speculative tasks launched.", current_issue="Excessive speculation wastes resources and indicates underlying issues", recommendation="Address root cause of slow tasks instead of speculation", expected_improvement="Better resource utilization", implementation="""Options: 1. Disable speculation if not needed: spark.speculation=false 2. Or tune speculation settings: spark.speculation.multiplier=1.5 spark.speculation.quantile=0.9 3. Fix underlying skew/memory issues first""", priority=3 ) def _recommend_reduce_tasks(self, metrics: SparkJobMetrics) -> OptimizationRecommendation: return OptimizationRecommendation( category="parallelism", severity="medium", title="Too Many Tasks", description=f"{metrics.tasks} tasks may cause excessive scheduling overhead.", current_issue="Very high task count increases driver overhead", recommendation="Reduce partition count for better efficiency", expected_improvement="Reduced scheduling overhead and driver memory usage", implementation=f""" 1. Reduce shuffle partitions: spark.sql.shuffle.partitions={max(200, metrics.tasks // 10)} 2. Coalesce partitions: df.coalesce({max(200, metrics.tasks // 10)}) 3. Use adaptive partitioning (Spark 3.0+): spark.sql.adaptive.enabled=true""", priority=3 ) def _recommend_increase_parallelism(self, metrics: SparkJobMetrics) -> OptimizationRecommendation: recommended_partitions = max(200, int(metrics.input_bytes / (128 * 1e6))) # 128MB per partition return OptimizationRecommendation( category="parallelism", severity="high", title="Low Parallelism", description=f"Only {metrics.tasks} tasks for {metrics.input_bytes / 1e9:.2f} GB of data.", current_issue="Under-utilization of cluster resources", recommendation="Increase parallelism to better utilize cluster", expected_improvement="Linear speedup with added parallelism", implementation=f""" 1. Increase shuffle partitions: spark.sql.shuffle.partitions={recommended_partitions} 2. Repartition input: df.repartition({recommended_partitions}) 3. Adjust default parallelism: spark.default.parallelism={recommended_partitions}""", priority=2 ) # ============================================================================= # Partition Strategy Advisor # ============================================================================= class PartitionAdvisor: """Recommend partitioning strategies based on data characteristics""" def recommend(self, data_stats: Dict) -> List[PartitionStrategy]: """Generate partition recommendations from data statistics""" recommendations = [] columns = data_stats.get('columns', {}) total_size_bytes = data_stats.get('total_size_bytes', 0) row_count = data_stats.get('row_count', 0) for col_name, col_stats in columns.items(): strategy = self._evaluate_column(col_name, col_stats, total_size_bytes, row_count) if strategy: recommendations.append(strategy) # Sort by partition effectiveness recommendations.sort(key=lambda s: s.partition_size_mb) return recommendations[:3] # Top 3 recommendations def _evaluate_column(self, col_name: str, col_stats: Dict, total_size_bytes: int, row_count: int) -> Optional[PartitionStrategy]: """Evaluate a column for partitioning potential""" cardinality = col_stats.get('cardinality', 0) data_type = col_stats.get('data_type', 'string') null_percentage = col_stats.get('null_percentage', 0) # Skip high-null columns if null_percentage > 20: return None # Date/timestamp columns are ideal for range partitioning if data_type in ('date', 'timestamp', 'datetime'): return self._recommend_date_partition(col_name, col_stats, total_size_bytes, row_count) # Low cardinality columns are good for list partitioning if cardinality and cardinality <= 100: return self._recommend_list_partition(col_name, col_stats, total_size_bytes, cardinality) # Medium cardinality columns can use hash partitioning if cardinality and 100 < cardinality <= 10000: return self._recommend_hash_partition(col_name, col_stats, total_size_bytes) return None def _recommend_date_partition(self, col_name: str, col_stats: Dict, total_size_bytes: int, row_count: int) -> PartitionStrategy: # Estimate daily partition size (assume 365 days of data) estimated_days = 365 partition_size_mb = (total_size_bytes / estimated_days) / (1024 * 1024) return PartitionStrategy( column=col_name, partition_type="range", num_partitions=None, # Dynamic based on date range partition_size_mb=partition_size_mb, reasoning=f"Date column '{col_name}' is ideal for range partitioning. " f"Estimated daily partition size: {partition_size_mb:.1f} MB", implementation=f""" -- BigQuery CREATE TABLE table_name PARTITION BY DATE({col_name}) AS SELECT * FROM source_table; -- Snowflake CREATE TABLE table_name CLUSTER BY (DATE_TRUNC('DAY', {col_name})); -- Spark/Hive df.write.partitionBy("{col_name}").parquet("path") -- PostgreSQL CREATE TABLE table_name (...) PARTITION BY RANGE ({col_name});""" ) def _recommend_list_partition(self, col_name: str, col_stats: Dict, total_size_bytes: int, cardinality: int) -> PartitionStrategy: partition_size_mb = (total_size_bytes / cardinality) / (1024 * 1024) return PartitionStrategy( column=col_name, partition_type="list", num_partitions=cardinality, partition_size_mb=partition_size_mb, reasoning=f"Column '{col_name}' has {cardinality} distinct values - ideal for list partitioning. " f"Estimated partition size: {partition_size_mb:.1f} MB", implementation=f""" -- Spark/Hive df.write.partitionBy("{col_name}").parquet("path") -- PostgreSQL CREATE TABLE table_name (...) PARTITION BY LIST ({col_name}); -- Note: List partitioning works best with stable, low-cardinality values""" ) def _recommend_hash_partition(self, col_name: str, col_stats: Dict, total_size_bytes: int) -> PartitionStrategy: # Target ~128MB partitions target_partition_size = 128 * 1024 * 1024 num_partitions = max(1, int(total_size_bytes / target_partition_size)) # Round to power of 2 for better distribution num_partitions = 2 ** int(math.log2(num_partitions) + 0.5) partition_size_mb = (total_size_bytes / num_partitions) / (1024 * 1024) return PartitionStrategy( column=col_name, partition_type="hash", num_partitions=num_partitions, partition_size_mb=partition_size_mb, reasoning=f"Column '{col_name}' has medium cardinality - hash partitioning provides even distribution. " f"Recommended {num_partitions} partitions (~{partition_size_mb:.1f} MB each)", implementation=f""" -- Spark df.repartition({num_partitions}, col("{col_name}")) -- PostgreSQL CREATE TABLE table_name (...) PARTITION BY HASH ({col_name}); -- Snowflake (clustering) ALTER TABLE table_name CLUSTER BY ({col_name});""" ) # ============================================================================= # Cost Estimator # ============================================================================= class CostEstimator: """Estimate query costs for cloud data warehouses""" # Pricing (approximate, varies by region and contract) PRICING = { 'snowflake': { 'compute_per_credit': 2.00, # USD per credit 'credits_per_hour': { 'x-small': 1, 'small': 2, 'medium': 4, 'large': 8, 'x-large': 16, }, 'storage_per_tb_month': 23.00, }, 'bigquery': { 'on_demand_per_tb': 5.00, # USD per TB scanned 'storage_per_tb_month': 20.00, 'streaming_insert_per_gb': 0.01, }, 'redshift': { 'dc2_large_per_hour': 0.25, 'ra3_xlarge_per_hour': 1.086, 'storage_per_gb_month': 0.024, }, 'databricks': { 'dbu_per_hour_sql': 0.22, 'dbu_per_hour_jobs': 0.15, } } def estimate(self, query_info: SQLQueryInfo, warehouse: str, data_stats: Optional[Dict] = None) -> CostEstimate: """Estimate query cost""" warehouse = warehouse.lower() if warehouse not in self.PRICING: raise ValueError(f"Unknown warehouse: {warehouse}. Supported: {list(self.PRICING.keys())}") # Estimate data scanned data_scanned_bytes = self._estimate_data_scanned(query_info, data_stats) data_scanned_tb = data_scanned_bytes / (1024 ** 4) if warehouse == 'bigquery': return self._estimate_bigquery(query_info, data_scanned_tb, data_stats) elif warehouse == 'snowflake': return self._estimate_snowflake(query_info, data_scanned_tb, data_stats) elif warehouse == 'redshift': return self._estimate_redshift(query_info, data_scanned_tb, data_stats) elif warehouse == 'databricks': return self._estimate_databricks(query_info, data_scanned_tb, data_stats) def _estimate_data_scanned(self, query_info: SQLQueryInfo, data_stats: Optional[Dict]) -> int: """Estimate bytes of data that will be scanned""" if data_stats and 'total_size_bytes' in data_stats: base_size = data_stats['total_size_bytes'] else: # Default assumption: 1GB per table base_size = len(query_info.tables) * 1e9 # Adjust for filters filter_factor = 1.0 if query_info.where_conditions: # Assume each filter reduces data by 50% (very rough) filter_factor = 0.5 ** min(len(query_info.where_conditions), 3) # Adjust for column projection if '*' not in query_info.columns and query_info.columns: # Assume selecting specific columns reduces scan by 50% filter_factor *= 0.5 return int(base_size * filter_factor) def _estimate_bigquery(self, query_info: SQLQueryInfo, data_scanned_tb: float, data_stats: Optional[Dict]) -> CostEstimate: pricing = self.PRICING['bigquery'] compute_cost = data_scanned_tb * pricing['on_demand_per_tb'] # Minimum billing of 10MB if data_scanned_tb < 10 / (1024 ** 2): compute_cost = 10 / (1024 ** 2) * pricing['on_demand_per_tb'] return CostEstimate( warehouse='BigQuery', compute_cost=compute_cost, storage_cost=0, # Storage cost separate data_transfer_cost=0, total_cost=compute_cost, assumptions=[ f"Estimated {data_scanned_tb * 1024:.2f} GB data scanned", "Using on-demand pricing ($5/TB)", "Assumes no slot reservations", "Actual cost depends on partitioning and clustering" ] ) def _estimate_snowflake(self, query_info: SQLQueryInfo, data_scanned_tb: float, data_stats: Optional[Dict]) -> CostEstimate: pricing = self.PRICING['snowflake'] # Estimate warehouse size and time complexity_to_size = { 'low': 'x-small', 'medium': 'small', 'high': 'medium', 'very_high': 'large' } warehouse_size = complexity_to_size.get(query_info.estimated_complexity, 'small') credits_per_hour = pricing['credits_per_hour'][warehouse_size] # Estimate runtime (very rough) estimated_seconds = max(1, data_scanned_tb * 1024 * 10) # 10 seconds per GB estimated_hours = estimated_seconds / 3600 credits_used = credits_per_hour * estimated_hours compute_cost = credits_used * pricing['compute_per_credit'] # Minimum 1 minute billing min_cost = (credits_per_hour / 60) * pricing['compute_per_credit'] compute_cost = max(compute_cost, min_cost) return CostEstimate( warehouse='Snowflake', compute_cost=compute_cost, storage_cost=0, data_transfer_cost=0, total_cost=compute_cost, assumptions=[ f"Warehouse size: {warehouse_size}", f"Estimated runtime: {estimated_seconds:.1f} seconds", f"Credits used: {credits_used:.4f}", "Minimum 1-minute billing applies", "Actual cost depends on warehouse auto-suspend settings" ] ) def _estimate_redshift(self, query_info: SQLQueryInfo, data_scanned_tb: float, data_stats: Optional[Dict]) -> CostEstimate: pricing = self.PRICING['redshift'] # Assume RA3 xl node type hourly_rate = pricing['ra3_xlarge_per_hour'] # Estimate runtime estimated_seconds = max(1, data_scanned_tb * 1024 * 15) # 15 seconds per GB estimated_hours = estimated_seconds / 3600 compute_cost = hourly_rate * estimated_hours return CostEstimate( warehouse='Redshift', compute_cost=compute_cost, storage_cost=0, data_transfer_cost=0, total_cost=compute_cost, assumptions=[ f"Using RA3.xlplus node type", f"Estimated runtime: {estimated_seconds:.1f} seconds", "Assumes dedicated cluster (not serverless)", "Actual cost depends on cluster configuration" ] ) def _estimate_databricks(self, query_info: SQLQueryInfo, data_scanned_tb: float, data_stats: Optional[Dict]) -> CostEstimate: pricing = self.PRICING['databricks'] # Estimate DBUs estimated_seconds = max(1, data_scanned_tb * 1024 * 12) estimated_hours = estimated_seconds / 3600 dbu_cost = pricing['dbu_per_hour_sql'] * estimated_hours return CostEstimate( warehouse='Databricks', compute_cost=dbu_cost, storage_cost=0, data_transfer_cost=0, total_cost=dbu_cost, assumptions=[ f"Using SQL warehouse", f"Estimated runtime: {estimated_seconds:.1f} seconds", "DBU rate may vary by workspace tier", "Does not include underlying cloud costs" ] ) # ============================================================================= # Report Generator # ============================================================================= class ReportGenerator: """Generate optimization reports""" def generate_text_report(self, query_info: SQLQueryInfo, recommendations: List[OptimizationRecommendation], cost_estimate: Optional[CostEstimate] = None) -> str: """Generate a text report""" lines = [] lines.append("=" * 80) lines.append("ETL PERFORMANCE OPTIMIZATION REPORT") lines.append("=" * 80) lines.append(f"\nGenerated: {datetime.now().isoformat()}") # Query summary lines.append("\n" + "-" * 40) lines.append("QUERY ANALYSIS") lines.append("-" * 40) lines.append(f"Query Type: {query_info.query_type}") lines.append(f"Tables: {', '.join(query_info.tables) or 'None'}") lines.append(f"Joins: {len(query_info.joins)}") lines.append(f"Subqueries: {query_info.subqueries}") lines.append(f"Aggregations: {', '.join(query_info.aggregations) or 'None'}") lines.append(f"Window Functions: {', '.join(query_info.window_functions) or 'None'}") lines.append(f"Complexity: {query_info.estimated_complexity.upper()}") # Cost estimate if cost_estimate: lines.append("\n" + "-" * 40) lines.append("COST ESTIMATE") lines.append("-" * 40) lines.append(f"Warehouse: {cost_estimate.warehouse}") lines.append(f"Estimated Cost: ${cost_estimate.total_cost:.4f} {cost_estimate.currency}") lines.append("Assumptions:") for assumption in cost_estimate.assumptions: lines.append(f" - {assumption}") # Recommendations if recommendations: lines.append("\n" + "-" * 40) lines.append(f"OPTIMIZATION RECOMMENDATIONS ({len(recommendations)} found)") lines.append("-" * 40) for i, rec in enumerate(recommendations, 1): severity_icon = { 'critical': 'šŸ”“', 'high': '🟠', 'medium': '🟔', 'low': '🟢' }.get(rec.severity, '⚪') lines.append(f"\n{i}. {severity_icon} [{rec.severity.upper()}] {rec.title}") lines.append(f" Category: {rec.category}") lines.append(f" Issue: {rec.current_issue}") lines.append(f" Recommendation: {rec.recommendation}") lines.append(f" Expected Improvement: {rec.expected_improvement}") lines.append(f"\n Implementation:") for impl_line in rec.implementation.strip().split('\n'): lines.append(f" {impl_line}") else: lines.append("\nāœ… No optimization issues detected") lines.append("\n" + "=" * 80) return "\n".join(lines) def generate_json_report(self, query_info: SQLQueryInfo, recommendations: List[OptimizationRecommendation], cost_estimate: Optional[CostEstimate] = None) -> Dict: """Generate a JSON report""" return { "report_type": "etl_performance_optimization", "generated_at": datetime.now().isoformat(), "query_analysis": { "query_type": query_info.query_type, "tables": query_info.tables, "joins": query_info.joins, "subqueries": query_info.subqueries, "aggregations": query_info.aggregations, "window_functions": query_info.window_functions, "complexity": query_info.estimated_complexity }, "cost_estimate": asdict(cost_estimate) if cost_estimate else None, "recommendations": [asdict(r) for r in recommendations], "summary": { "total_recommendations": len(recommendations), "critical": sum(1 for r in recommendations if r.severity == "critical"), "high": sum(1 for r in recommendations if r.severity == "high"), "medium": sum(1 for r in recommendations if r.severity == "medium"), "low": sum(1 for r in recommendations if r.severity == "low") } } # ============================================================================= # CLI Commands # ============================================================================= def cmd_analyze_sql(args): """Analyze SQL query for optimization opportunities""" # Load SQL sql_path = Path(args.input) if sql_path.exists(): with open(sql_path, 'r') as f: sql = f.read() else: sql = args.input # Treat as inline SQL # Parse and analyze parser = SQLParser() query_info = parser.parse(sql) optimizer = SQLOptimizer() recommendations = optimizer.analyze(query_info, sql) # Cost estimate if warehouse specified cost_estimate = None if args.warehouse: estimator = CostEstimator() data_stats = None if args.stats: with open(args.stats, 'r') as f: data_stats = json.load(f) cost_estimate = estimator.estimate(query_info, args.warehouse, data_stats) # Generate report reporter = ReportGenerator() if args.json: report = reporter.generate_json_report(query_info, recommendations, cost_estimate) output = json.dumps(report, indent=2) else: output = reporter.generate_text_report(query_info, recommendations, cost_estimate) if args.output: with open(args.output, 'w') as f: f.write(output) logger.info(f"Report saved to {args.output}") else: print(output) def cmd_analyze_spark(args): """Analyze Spark job metrics""" with open(args.input, 'r') as f: metrics_data = json.load(f) # Handle both single job and array of jobs if isinstance(metrics_data, list): jobs = metrics_data else: jobs = [metrics_data] all_recommendations = [] analyzer = SparkJobAnalyzer() for job_data in jobs: metrics = SparkJobMetrics( job_id=job_data.get('jobId', 'unknown'), duration_ms=job_data.get('duration', 0), stages=job_data.get('numStages', 0), tasks=job_data.get('numTasks', 0), shuffle_read_bytes=job_data.get('shuffleReadBytes', 0), shuffle_write_bytes=job_data.get('shuffleWriteBytes', 0), input_bytes=job_data.get('inputBytes', 0), output_bytes=job_data.get('outputBytes', 0), peak_memory_bytes=job_data.get('peakMemoryBytes', 0), gc_time_ms=job_data.get('gcTime', 0), failed_tasks=job_data.get('failedTasks', 0), speculative_tasks=job_data.get('speculativeTasks', 0), skew_ratio=job_data.get('skewRatio', 1.0) ) recommendations = analyzer.analyze(metrics) all_recommendations.extend(recommendations) # Deduplicate similar recommendations unique_recs = [] seen_titles = set() for rec in all_recommendations: if rec.title not in seen_titles: unique_recs.append(rec) seen_titles.add(rec.title) # Output if args.json: output = json.dumps([asdict(r) for r in unique_recs], indent=2) else: lines = [] lines.append("=" * 60) lines.append("SPARK JOB OPTIMIZATION REPORT") lines.append("=" * 60) lines.append(f"\nJobs Analyzed: {len(jobs)}") lines.append(f"Recommendations: {len(unique_recs)}") for i, rec in enumerate(unique_recs, 1): lines.append(f"\n{i}. [{rec.severity.upper()}] {rec.title}") lines.append(f" {rec.description}") lines.append(f" Implementation: {rec.implementation[:200]}...") output = "\n".join(lines) if args.output: with open(args.output, 'w') as f: f.write(output) else: print(output) def cmd_optimize_partition(args): """Recommend partition strategies""" with open(args.input, 'r') as f: data_stats = json.load(f) advisor = PartitionAdvisor() strategies = advisor.recommend(data_stats) if args.json: output = json.dumps([asdict(s) for s in strategies], indent=2) else: lines = [] lines.append("=" * 60) lines.append("PARTITION STRATEGY RECOMMENDATIONS") lines.append("=" * 60) if not strategies: lines.append("\nNo partition recommendations based on provided data statistics.") else: for i, strategy in enumerate(strategies, 1): lines.append(f"\n{i}. Partition by: {strategy.column}") lines.append(f" Type: {strategy.partition_type}") if strategy.num_partitions: lines.append(f" Partitions: {strategy.num_partitions}") lines.append(f" Estimated size: {strategy.partition_size_mb:.1f} MB per partition") lines.append(f" Reasoning: {strategy.reasoning}") lines.append(f"\n Implementation:") for impl_line in strategy.implementation.strip().split('\n'): lines.append(f" {impl_line}") output = "\n".join(lines) if args.output: with open(args.output, 'w') as f: f.write(output) else: print(output) def cmd_estimate_cost(args): """Estimate query cost""" # Load SQL sql_path = Path(args.input) if sql_path.exists(): with open(sql_path, 'r') as f: sql = f.read() else: sql = args.input # Parse parser = SQLParser() query_info = parser.parse(sql) # Load data stats if provided data_stats = None if args.stats: with open(args.stats, 'r') as f: data_stats = json.load(f) # Estimate cost estimator = CostEstimator() cost = estimator.estimate(query_info, args.warehouse, data_stats) if args.json: output = json.dumps(asdict(cost), indent=2) else: lines = [] lines.append(f"Cost Estimate for {cost.warehouse}") lines.append("=" * 40) lines.append(f"Compute Cost: ${cost.compute_cost:.4f}") lines.append(f"Storage Cost: ${cost.storage_cost:.4f}") lines.append(f"Data Transfer: ${cost.data_transfer_cost:.4f}") lines.append("-" * 40) lines.append(f"Total: ${cost.total_cost:.4f} {cost.currency}") lines.append("\nAssumptions:") for assumption in cost.assumptions: lines.append(f" - {assumption}") output = "\n".join(lines) if args.output: with open(args.output, 'w') as f: f.write(output) else: print(output) def cmd_generate_template(args): """Generate template files""" templates = { 'data_stats': { "total_size_bytes": 10737418240, "row_count": 10000000, "columns": { "id": { "data_type": "integer", "cardinality": 10000000, "null_percentage": 0 }, "created_at": { "data_type": "timestamp", "cardinality": 1000000, "null_percentage": 0 }, "category": { "data_type": "string", "cardinality": 50, "null_percentage": 2 }, "amount": { "data_type": "float", "cardinality": 100000, "null_percentage": 5 } } }, 'spark_metrics': { "jobId": "job_12345", "duration": 300000, "numStages": 5, "numTasks": 200, "shuffleReadBytes": 5368709120, "shuffleWriteBytes": 2147483648, "inputBytes": 10737418240, "outputBytes": 1073741824, "peakMemoryBytes": 4294967296, "gcTime": 15000, "failedTasks": 2, "speculativeTasks": 5, "skewRatio": 3.5 } } if args.template not in templates: logger.error(f"Unknown template: {args.template}. Available: {list(templates.keys())}") sys.exit(1) output = json.dumps(templates[args.template], indent=2) if args.output: with open(args.output, 'w') as f: f.write(output) logger.info(f"Template saved to {args.output}") else: print(output) def main(): """Main entry point""" parser = argparse.ArgumentParser( description="ETL Performance Optimizer - Analyze and optimize data pipelines", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Analyze SQL query python etl_performance_optimizer.py analyze-sql query.sql # Analyze with cost estimate python etl_performance_optimizer.py analyze-sql query.sql --warehouse bigquery # Analyze Spark job metrics python etl_performance_optimizer.py analyze-spark spark-history.json # Get partition recommendations python etl_performance_optimizer.py optimize-partition data_stats.json # Estimate query cost python etl_performance_optimizer.py estimate-cost query.sql --warehouse snowflake # Generate template files python etl_performance_optimizer.py template data_stats --output stats.json """ ) parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output') subparsers = parser.add_subparsers(dest='command', help='Command to run') # Analyze SQL command sql_parser = subparsers.add_parser('analyze-sql', help='Analyze SQL query') sql_parser.add_argument('input', help='SQL file or inline query') sql_parser.add_argument('--warehouse', '-w', choices=['bigquery', 'snowflake', 'redshift', 'databricks'], help='Warehouse for cost estimation') sql_parser.add_argument('--stats', '-s', help='Data statistics JSON file') sql_parser.add_argument('--output', '-o', help='Output file') sql_parser.add_argument('--json', action='store_true', help='Output as JSON') sql_parser.set_defaults(func=cmd_analyze_sql) # Analyze Spark command spark_parser = subparsers.add_parser('analyze-spark', help='Analyze Spark job metrics') spark_parser.add_argument('input', help='Spark metrics JSON file') spark_parser.add_argument('--output', '-o', help='Output file') spark_parser.add_argument('--json', action='store_true', help='Output as JSON') spark_parser.set_defaults(func=cmd_analyze_spark) # Optimize partition command partition_parser = subparsers.add_parser('optimize-partition', help='Recommend partition strategies') partition_parser.add_argument('input', help='Data statistics JSON file') partition_parser.add_argument('--output', '-o', help='Output file') partition_parser.add_argument('--json', action='store_true', help='Output as JSON') partition_parser.set_defaults(func=cmd_optimize_partition) # Estimate cost command cost_parser = subparsers.add_parser('estimate-cost', help='Estimate query cost') cost_parser.add_argument('input', help='SQL file or inline query') cost_parser.add_argument('--warehouse', '-w', required=True, choices=['bigquery', 'snowflake', 'redshift', 'databricks'], help='Target warehouse') cost_parser.add_argument('--stats', '-s', help='Data statistics JSON file') cost_parser.add_argument('--output', '-o', help='Output file') cost_parser.add_argument('--json', action='store_true', help='Output as JSON') cost_parser.set_defaults(func=cmd_estimate_cost) # Template command template_parser = subparsers.add_parser('template', help='Generate template files') template_parser.add_argument('template', choices=['data_stats', 'spark_metrics'], help='Template type') template_parser.add_argument('--output', '-o', help='Output file') template_parser.set_defaults(func=cmd_generate_template) args = parser.parse_args() if args.verbose: logging.getLogger().setLevel(logging.DEBUG) if not args.command: parser.print_help() sys.exit(1) try: args.func(args) except Exception as e: logger.error(f"Error: {e}") if args.verbose: import traceback traceback.print_exc() sys.exit(1) if __name__ == '__main__': main()