# claude-skills-reference/engineering/sql-database-assistant/scripts/query_optimizer.py

#!/usr/bin/env python3
"""
SQL Query Optimizer — Static Analysis

Analyzes SQL queries for common performance issues:
- SELECT * usage
- Missing WHERE clauses on UPDATE/DELETE
- Cartesian joins (missing JOIN conditions)
- Subqueries in SELECT list
- Missing LIMIT on unbounded SELECTs
- Function calls on indexed columns (non-sargable)
- LIKE with leading wildcard
- ORDER BY RAND()
- UNION instead of UNION ALL
- NOT IN with subquery (NULL-unsafe)

Usage:
    python query_optimizer.py --query "SELECT * FROM users"
    python query_optimizer.py --query queries.sql --dialect postgres
    python query_optimizer.py --query "SELECT * FROM orders" --json
"""

import argparse
import json
import os
import re
import sys
from dataclasses import dataclass, asdict
from typing import List, Optional


@dataclass
class Issue:
    """A single optimization issue found in a query.

    Instances are created by the ``check_*`` rule functions and serialized
    with ``dataclasses.asdict`` in ``QueryAnalysis.to_dict``.
    """
    # Severity label. The checkers in this file use "critical", "warning"
    # or "info"; analyze_query() deducts score points based on this value.
    severity: str
    # Short machine-friendly rule identifier, e.g. "select-star".
    rule: str
    # Human-readable description of the problem.
    message: str
    # Human-readable hint on how to rewrite the query.
    suggestion: str
    # Optional line number of the finding. None of the checkers in this file
    # set it, so it stays None unless a caller fills it in.
    line: Optional[int] = None


@dataclass
class QueryAnalysis:
    """Outcome of running the rule checkers over one SQL statement."""
    # The analyzed SQL text.
    query: str
    # Findings, one per rule that fired.
    issues: List[Issue]
    # Quality score from 0 to 100; higher is better.
    score: int

    def to_dict(self):
        """Return a JSON-serializable summary of this analysis.

        The query text is cut to its first 200 characters, with "..."
        appended when something was dropped.
        """
        preview = self.query[:200]
        if len(self.query) > 200:
            preview += "..."
        serialized_issues = [asdict(issue) for issue in self.issues]
        return {
            "query": preview,
            "issues": serialized_issues,
            "issue_count": len(self.issues),
            "score": self.score,
        }


# ---------------------------------------------------------------------------
# Rule checkers
# ---------------------------------------------------------------------------

def check_select_star(sql: str) -> Optional[Issue]:
    """Detect ``SELECT *`` projections.

    A bare ``*`` is matched when it is the first item of a select list,
    optionally preceded by ``DISTINCT``/``ALL`` and/or a SQL Server style
    ``TOP n`` / ``TOP (n)`` [``PERCENT``] clause. Nothing is required after
    the star, so ``SELECT *FROM t`` and a trailing ``SELECT *`` are caught too.
    ``COUNT(*)`` and qualified ``alias.*`` projections are not matched.

    Args:
        sql: SQL text of one statement.

    Returns:
        A ``select-star`` warning, or None when no such projection is found.
    """
    star_projection = re.compile(
        r'\bSELECT\s+'
        r'(?:(?:DISTINCT|ALL)\s+)?'
        # "TOP 10 " needs surrounding whitespace so a column such as
        # "top10" is never mistaken for a TOP clause.
        r'(?:(?:TOP\s*\(\s*\d+\s*\)\s*|TOP\s+\d+\s+)(?:PERCENT\s+)?)?'
        # Directly after SELECT a "*" cannot be a multiplication operator.
        r'\*',
        re.IGNORECASE,
    )
    if star_projection.search(sql) is None:
        return None
    return Issue(
        severity="warning",
        rule="select-star",
        message="SELECT * transfers unnecessary data and breaks on schema changes.",
        suggestion="List only the columns you need: SELECT col1, col2, ...",
    )


def check_missing_where(sql: str) -> Optional[Issue]:
    """Detect UPDATE/DELETE statements that have no top-level WHERE clause.

    The check looks only at the outermost level of the statement: string
    literals are blanked and parenthesized groups are removed first, so a
    WHERE that appears only inside a subquery or inside quoted text does not
    count. WHERE is matched as a whole word, so identifiers that merely
    contain the letters (``nowhere``, ``somewhere_id``) do not hide a
    missing clause.

    Args:
        sql: SQL text of one statement.

    Returns:
        A ``missing-where`` critical issue, or None when the statement is not
        an UPDATE/DELETE or has a top-level WHERE.
    """
    # Blank out single-quoted literals ('' is an escaped quote inside one).
    top_level = re.sub(r"'(?:[^']|'')*'", "''", sql)

    # Repeatedly drop the innermost (...) groups until none are left, which
    # leaves only the tokens of the outermost statement.
    previous = None
    while previous != top_level:
        previous = top_level
        top_level = re.sub(r'\([^()]*\)', ' ', top_level)

    statement = re.match(r'\s*(UPDATE|DELETE)\b', top_level, re.IGNORECASE)
    if statement is None:
        return None
    if re.search(r'\bWHERE\b', top_level, re.IGNORECASE):
        return None

    keyword = statement.group(1).upper()
    return Issue(
        severity="critical",
        rule="missing-where",
        message=f"{keyword} without WHERE affects every row in the table.",
        suggestion=f"Add a WHERE clause to restrict the {keyword} scope.",
    )


def check_cartesian_join(sql: str) -> Optional[Issue]:
    """Detect comma-separated tables with no JOIN and no WHERE at the same level.

    The statement is examined one nesting level at a time. String literals
    are blanked first, then each parenthesized group is analyzed on its own
    with its nested groups removed. As a result:

    * commas inside function calls or subqueries in FROM (for example
      ``generate_series(1, 10)``) are not counted as table separators;
    * a WHERE inside a subquery does not excuse a comma join outside it;
    * WHERE is matched as a whole word, so an identifier such as
      ``somewhere`` does not suppress the check;
    * a FROM clause ends at UNION/INTERSECT/EXCEPT, so the select list of the
      next branch is not mistaken for a table list.

    Args:
        sql: SQL text of one statement.

    Returns:
        A ``cartesian-join`` critical issue for the first offending level,
        or None when no level lists several tables without JOIN or WHERE.
    """
    # Blank out single-quoted literals ('' is an escaped quote inside one).
    remaining = re.sub(r"'(?:[^']|'')*'", "''", sql)

    # Peel off innermost (...) groups repeatedly. Every group body becomes a
    # separate "level"; what is left at the end is the outermost level.
    innermost_group = re.compile(r'\(([^()]*)\)')
    levels = []
    while True:
        bodies = innermost_group.findall(remaining)
        if not bodies:
            break
        levels.extend(bodies)
        remaining = innermost_group.sub(' ', remaining)
    levels.append(remaining)

    from_pattern = re.compile(
        r'\bFROM\s+(.+?)'
        r'(?:\bWHERE\b|\bGROUP\b|\bORDER\b|\bLIMIT\b|\bHAVING\b'
        r'|\bUNION\b|\bINTERSECT\b|\bEXCEPT\b|;|$)',
        re.IGNORECASE | re.DOTALL,
    )

    for level in levels:
        if not re.search(r'\bSELECT\b', level, re.IGNORECASE):
            continue  # e.g. the argument list of a function call
        if re.search(r'\bWHERE\b', level, re.IGNORECASE):
            continue  # a WHERE at this level may carry the join condition
        for from_match in from_pattern.finditer(level):
            from_clause = from_match.group(1)
            # Explicit JOIN syntax is assumed to bring its own ON/USING.
            if re.search(r'\bJOIN\b', from_clause, re.IGNORECASE):
                continue
            tables = [t.strip() for t in from_clause.split(",") if t.strip()]
            if len(tables) > 1:
                return Issue(
                    severity="critical",
                    rule="cartesian-join",
                    message="Multiple tables in FROM without JOIN or WHERE creates a cartesian product.",
                    suggestion="Use explicit JOIN syntax with ON conditions.",
                )
    return None


def check_subquery_in_select(sql: str) -> Optional[Issue]:
    """Flag a parenthesized SELECT that appears inside the select list.

    The select list is taken as the text between the first SELECT and the
    first FROM that follows it; the warning fires when that text contains
    "(" followed by SELECT.
    """
    projection = re.search(
        r'\bSELECT\b(.+?)\bFROM\b', sql, flags=re.IGNORECASE | re.DOTALL
    )
    if projection is None:
        return None

    nested = re.search(r'\(\s*SELECT\b', projection.group(1), flags=re.IGNORECASE)
    if nested is None:
        return None

    return Issue(
        rule="subquery-in-select",
        severity="warning",
        message="Subquery in SELECT list executes once per row (correlated subquery).",
        suggestion="Rewrite as a LEFT JOIN with aggregation.",
    )


def check_missing_limit(sql: str) -> Optional[Issue]:
    """Detect a SELECT statement with no row-limiting clause.

    Only statements that start with SELECT are considered. A statement that
    uses COUNT(...) without GROUP BY is skipped. LIMIT, FETCH and a
    ``SELECT [DISTINCT|ALL] TOP ...`` prefix count as row limits.

    All keywords are matched as whole words, so identifiers that merely
    contain them (``rate_limit``, ``prefetched``, ``desktop ``) no longer
    suppress the finding, and ``GROUP BY`` is recognized across any
    whitespace, including line breaks.

    Args:
        sql: SQL text of one statement.

    Returns:
        A ``missing-limit`` info issue, or None when the statement is not a
        SELECT, is a COUNT without GROUP BY, or already limits its rows.
    """
    statement = sql.strip()
    if not re.match(r'SELECT\b', statement, re.IGNORECASE):
        return None

    # COUNT without GROUP BY is skipped, as in the original rule.
    uses_count = re.search(r'\bCOUNT\s*\(', statement, re.IGNORECASE)
    groups_rows = re.search(r'\bGROUP\s+BY\b', statement, re.IGNORECASE)
    if uses_count and not groups_rows:
        return None

    if re.search(r'\b(?:LIMIT|FETCH)\b', statement, re.IGNORECASE):
        return None
    # TOP must follow SELECT and be followed by a number, "(" or a variable,
    # so that a column called "top" is not taken for a TOP clause.
    if re.search(r'\bSELECT\s+(?:(?:DISTINCT|ALL)\s+)?TOP\s*(?:\(|\d|@)',
                 statement, re.IGNORECASE):
        return None

    return Issue(
        severity="info",
        rule="missing-limit",
        message="SELECT without LIMIT may return unbounded rows.",
        suggestion="Add LIMIT to prevent returning excessive data.",
    )


def check_function_on_column(sql: str) -> Optional[Issue]:
    """Flag well-known function calls inside the WHERE clause.

    The WHERE clause is the text from the first WHERE up to the next
    GROUP/ORDER/LIMIT/HAVING keyword, a semicolon, or the end of the input.
    Any of the listed function names followed by "(" triggers the warning,
    whichever side of a comparison it appears on.
    """
    predicate = re.search(
        r'\bWHERE\b(.+?)(?:\bGROUP\b|\bORDER\b|\bLIMIT\b|\bHAVING\b|;|$)',
        sql,
        flags=re.IGNORECASE | re.DOTALL,
    )
    if predicate is None:
        return None

    call = re.search(
        r'\b(YEAR|MONTH|DAY|DATE|UPPER|LOWER|TRIM|CAST|COALESCE|IFNULL|NVL)\s*\(',
        predicate.group(1),
        flags=re.IGNORECASE,
    )
    if call is None:
        return None

    # Report the function name in upper case regardless of how it was typed.
    func = call.group(1).upper()
    return Issue(
        rule="non-sargable",
        severity="warning",
        message=f"Function {func}() on column in WHERE prevents index usage.",
        suggestion="Rewrite to compare the raw column against transformed constants.",
    )


def check_leading_wildcard(sql: str) -> Optional[Issue]:
    """Flag LIKE patterns whose quoted literal starts with ``%``."""
    leading_percent = re.compile(r"LIKE\s+'%", re.IGNORECASE)
    if leading_percent.search(sql) is None:
        return None
    return Issue(
        rule="leading-wildcard",
        severity="warning",
        message="LIKE with leading wildcard prevents index usage.",
        suggestion="Use full-text search (GIN index, FULLTEXT, FTS5) for substring matching.",
    )


def check_order_by_rand(sql: str) -> Optional[Issue]:
    """Flag ``ORDER BY RAND()`` and ``ORDER BY RANDOM()``."""
    random_sort = re.compile(r'ORDER\s+BY\s+(RAND|RANDOM)\s*\(\)', re.IGNORECASE)
    if random_sort.search(sql) is None:
        return None
    return Issue(
        rule="order-by-rand",
        severity="warning",
        message="ORDER BY RAND() scans and sorts the entire table.",
        suggestion="Use application-side random sampling or TABLESAMPLE.",
    )


def check_union_vs_union_all(sql: str) -> Optional[Issue]:
    """Flag a UNION keyword that is not immediately followed by ALL."""
    plain_union = re.compile(r'\bUNION\b(?!\s+ALL\b)', re.IGNORECASE)
    if plain_union.search(sql) is None:
        return None
    return Issue(
        rule="union-without-all",
        severity="info",
        message="UNION performs deduplication sort; use UNION ALL if duplicates are acceptable.",
        suggestion="Replace UNION with UNION ALL unless you specifically need deduplication.",
    )


def check_not_in_subquery(sql: str) -> Optional[Issue]:
    """Flag ``NOT IN (SELECT ...)``, which misbehaves when the subquery yields NULL."""
    not_in_select = re.compile(r'\bNOT\s+IN\s*\(\s*SELECT\b', re.IGNORECASE)
    if not_in_select.search(sql) is None:
        return None
    return Issue(
        rule="not-in-subquery",
        severity="warning",
        message="NOT IN with subquery returns no rows if any subquery result is NULL.",
        suggestion="Use NOT EXISTS (SELECT 1 ...) instead.",
    )


# Registry of rule checkers. analyze_query() calls each entry with the SQL
# text, in this order, and collects every non-None Issue, so the order here
# is also the order in which findings are reported.
ALL_CHECKS = [
    check_select_star,
    check_missing_where,
    check_cartesian_join,
    check_subquery_in_select,
    check_missing_limit,
    check_function_on_column,
    check_leading_wildcard,
    check_order_by_rand,
    check_union_vs_union_all,
    check_not_in_subquery,
]


# ---------------------------------------------------------------------------
# Analysis engine
# ---------------------------------------------------------------------------

def analyze_query(sql: str, dialect: str = "postgres") -> QueryAnalysis:
    """Apply every registered checker to one SQL statement and score it.

    The score starts at 100 and loses 25 points per critical issue, 10 per
    warning and 5 for any other severity, never dropping below 0.

    Args:
        sql: SQL text of one statement.
        dialect: SQL dialect name. Accepted for the CLI, but not consulted
            by this function or passed on to the checkers.

    Returns:
        The stripped query together with its issues and score.
    """
    penalties = {"critical": 25, "warning": 10}
    default_penalty = 5

    findings: List[Issue] = [
        found for found in (check(sql) for check in ALL_CHECKS) if found
    ]
    deducted = sum(penalties.get(found.severity, default_penalty) for found in findings)

    return QueryAnalysis(
        query=sql.strip(),
        issues=findings,
        score=max(0, 100 - deducted),
    )


def split_queries(text: str) -> List[str]:
    """Split SQL text into individual statements.

    Statements are separated at semicolons, except semicolons that sit
    inside a quoted region ('...', "..." or `...`, where a doubled quote
    character is an escaped quote), a ``--`` line comment, or a
    ``/* ... */`` block comment. Each returned statement is stripped and
    ends with ";". Fragments of five characters or fewer are dropped.

    Backslash-escaped quotes are not interpreted. If the text ends while a
    quote is still open, the quote tracking is considered unreliable and
    the text is split at every semicolon instead.

    Args:
        text: SQL text holding zero or more statements.

    Returns:
        The statements in their original order.
    """
    def keep(fragments: List[str]) -> List[str]:
        # Strip each fragment, drop the trivially short ones, restore the ";".
        stripped = (fragment.strip() for fragment in fragments)
        return [stmt + ";" for stmt in stripped if len(stmt) > 5]

    fragments: List[str] = []
    current: List[str] = []
    quote = None  # the quote character of the region we are inside, if any
    i, n = 0, len(text)
    while i < n:
        ch = text[i]
        if quote is not None:
            current.append(ch)
            if ch == quote:
                if text[i + 1:i + 2] == quote:
                    # Doubled quote: an escaped quote, still inside the region.
                    current.append(quote)
                    i += 1
                else:
                    quote = None
        elif ch in "'\"`":
            quote = ch
            current.append(ch)
        elif text.startswith("--", i):
            # Line comment: copy up to, but not including, the newline.
            stop = text.find("\n", i)
            stop = n if stop == -1 else stop
            current.append(text[i:stop])
            i = stop
            continue
        elif text.startswith("/*", i):
            # Block comment: copy through the closing "*/" (or to the end).
            stop = text.find("*/", i + 2)
            stop = n if stop == -1 else stop + 2
            current.append(text[i:stop])
            i = stop
            continue
        elif ch == ";":
            fragments.append("".join(current))
            current = []
        else:
            current.append(ch)
        i += 1

    if quote is not None:
        # Unbalanced quoting: fall back to splitting at every semicolon.
        return keep(text.split(";"))

    fragments.append("".join(current))
    return keep(fragments)


# ---------------------------------------------------------------------------
# Output formatting
# ---------------------------------------------------------------------------

# Text tag printed in front of each issue by format_text(), keyed by
# Issue.severity. Severities missing from this map are printed without a tag.
SEVERITY_ICONS = {"critical": "[CRITICAL]", "warning": "[WARNING]", "info": "[INFO]"}


def format_text(analyses: List[QueryAnalysis]) -> str:
    """Render analyses as a plain-text report.

    Each query gets a separator line, a numbered header with its score, the
    first 120 characters of the query (followed by "..." when truncated),
    and then one tagged line plus one suggestion line per issue.
    """
    separator = "=" * 60
    report = []
    for number, analysis in enumerate(analyses, start=1):
        preview = analysis.query[:120]
        if len(analysis.query) > 120:
            preview += "..."

        report.extend([
            separator,
            f"Query {number} (Score: {analysis.score}/100)",
            f"  {preview}",
            "",
        ])

        if not analysis.issues:
            report.append("  No issues detected.")
        for issue in analysis.issues:
            tag = SEVERITY_ICONS.get(issue.severity, "")
            report.extend([
                f"  {tag} {issue.rule}: {issue.message}",
                f"    -> {issue.suggestion}",
            ])
        report.append("")
    return "\n".join(report)


def format_json(analyses: List[QueryAnalysis]) -> str:
    """Render analyses as an indented JSON document.

    The document has two keys: "analyses", holding each analysis's
    ``to_dict()`` output, and "total_queries", the number of analyses.
    """
    payload = {
        "analyses": [analysis.to_dict() for analysis in analyses],
        "total_queries": len(analyses),
    }
    return json.dumps(payload, indent=2)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main():
    """Command-line entry point: parse arguments, analyze, print the report.

    ``--query`` is read as a file when it names an existing file; otherwise
    it is treated as inline SQL. A value that is a single token ending in
    ".sql" but does not exist on disk is reported as a missing file (argparse
    usage error, exit status 2) rather than being analyzed as SQL, which
    would otherwise print a misleading clean report.
    """
    parser = argparse.ArgumentParser(
        description="Analyze SQL queries for common performance issues.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --query "SELECT * FROM users"
  %(prog)s --query queries.sql --dialect mysql
  %(prog)s --query "DELETE FROM orders" --json
        """,
    )
    parser.add_argument(
        "--query", required=True,
        help="SQL query string or path to a .sql file",
    )
    parser.add_argument(
        "--dialect", choices=["postgres", "mysql", "sqlite", "sqlserver"],
        default="postgres", help="SQL dialect (default: postgres)",
    )
    parser.add_argument(
        "--json", action="store_true", dest="json_output",
        help="Output results as JSON",
    )
    args = parser.parse_args()

    # Determine if query is a file path or inline SQL
    sql_text = args.query
    if os.path.isfile(args.query):
        # utf-8-sig drops a leading byte-order mark, which would otherwise
        # defeat the "starts with SELECT/UPDATE/DELETE" checks; undecodable
        # bytes are replaced so one stray byte does not abort the analysis.
        with open(args.query, "r", encoding="utf-8-sig", errors="replace") as f:
            sql_text = f.read()
    elif (args.query.lower().endswith(".sql")
          and not any(ch.isspace() for ch in args.query)):
        # A lone token ending in ".sql" is a path, not a SQL statement.
        parser.error(f"SQL file not found: {args.query}")

    queries = split_queries(sql_text)
    if not queries:
        # Treat the whole input as a single query
        queries = [sql_text.strip()]

    analyses = [analyze_query(q, args.dialect) for q in queries]

    if args.json_output:
        print(format_json(analyses))
    else:
        print(format_text(analyses))


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()