"""Reddit thread enrichment with real engagement metrics."""

import re
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

from . import http, dates


def extract_reddit_path(url: str) -> Optional[str]:
    """Extract the path from a Reddit URL.

    Args:
        url: Reddit URL

    Returns:
        Path component, or None if the URL is not a reddit.com URL
    """
    try:
        parsed = urlparse(url)
        if "reddit.com" not in parsed.netloc:
            return None
        return parsed.path
    except ValueError:  # urlparse raises ValueError on malformed input
        return None
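

# Illustrative behavior of extract_reddit_path (doctest-style sketch with
# hypothetical URLs; kept as comments so the module stays import-clean):
#
#   >>> extract_reddit_path("https://www.reddit.com/r/python/comments/abc123/some_title/")
#   '/r/python/comments/abc123/some_title/'
#   >>> extract_reddit_path("https://example.com/post")  # non-Reddit host -> None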


def fetch_thread_data(url: str, mock_data: Optional[List] = None) -> Optional[List[Any]]:
    """Fetch Reddit thread JSON data.

    Args:
        url: Reddit thread URL
        mock_data: Mock thread JSON for testing (bypasses the network)

    Returns:
        Raw thread JSON (a two-element list of listings) or None on failure
    """
    if mock_data is not None:
        return mock_data

    path = extract_reddit_path(url)
    if not path:
        return None

    try:
        return http.get_reddit_json(path)
    except http.HTTPError:
        return None
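

# Sketch of the mock short-circuit in fetch_thread_data: any non-None
# mock_data is returned as-is, so tests never hit the network (the
# two-element list below only mimics Reddit's "<permalink>.json" shape):
#
#   >>> mock = [{"data": {"children": []}}, {"data": {"children": []}}]
#   >>> fetch_thread_data("https://www.reddit.com/r/python/comments/abc123/t/", mock) is mock
#   True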


def parse_thread_data(data: Any) -> Dict[str, Any]:
    """Parse Reddit thread JSON into structured data.

    Args:
        data: Raw Reddit JSON response

    Returns:
        Dict with "submission" (dict or None) and "comments" (list of dicts)
    """
    result: Dict[str, Any] = {
        "submission": None,
        "comments": [],
    }

    if not isinstance(data, list) or len(data) < 1:
        return result

    # First element is the submission listing
    submission_listing = data[0]
    if isinstance(submission_listing, dict):
        children = submission_listing.get("data", {}).get("children", [])
        if children:
            sub_data = children[0].get("data", {})
            result["submission"] = {
                "score": sub_data.get("score"),
                "num_comments": sub_data.get("num_comments"),
                "upvote_ratio": sub_data.get("upvote_ratio"),
                "created_utc": sub_data.get("created_utc"),
                "permalink": sub_data.get("permalink"),
                "title": sub_data.get("title"),
                # Guard against a null selftext before truncating
                "selftext": (sub_data.get("selftext") or "")[:500],
            }

    # Second element is the comments listing
    if len(data) >= 2:
        comments_listing = data[1]
        if isinstance(comments_listing, dict):
            children = comments_listing.get("data", {}).get("children", [])
            for child in children:
                if child.get("kind") != "t1":  # t1 = comment; skips "more" stubs
                    continue
                c_data = child.get("data", {})
                if not c_data.get("body"):
                    continue

                comment = {
                    "score": c_data.get("score", 0),
                    "created_utc": c_data.get("created_utc"),
                    "author": c_data.get("author", "[deleted]"),
                    "body": c_data.get("body", "")[:300],  # Truncate long comments
                    "permalink": c_data.get("permalink"),
                }
                result["comments"].append(comment)

    return result
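

# Shape of the payload parse_thread_data expects, as served by Reddit's
# public "<permalink>.json" endpoint (field values invented for illustration):
#
#   [
#       {"data": {"children": [          # listing 0: the submission (kind "t3")
#           {"kind": "t3", "data": {"score": 512, "num_comments": 98,
#                                   "upvote_ratio": 0.97, "title": "..."}}]}},
#       {"data": {"children": [          # listing 1: top-level comments (kind "t1")
#           {"kind": "t1", "data": {"score": 64, "author": "someone",
#                                   "body": "..."}}]}},
#   ]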


def get_top_comments(comments: List[Dict], limit: int = 10) -> List[Dict[str, Any]]:
    """Get top comments sorted by score.

    Args:
        comments: List of comment dicts
        limit: Maximum number to return

    Returns:
        Top comments sorted by score, highest first
    """
    # Filter out deleted/removed authors
    valid = [c for c in comments if c.get("author") not in ("[deleted]", "[removed]")]

    # Sort by score descending
    sorted_comments = sorted(valid, key=lambda c: c.get("score", 0), reverse=True)

    return sorted_comments[:limit]
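

# Sketch of get_top_comments on invented data: deleted/removed authors are
# dropped before ranking, then the rest sort by score descending:
#
#   >>> get_top_comments([
#   ...     {"author": "[deleted]", "score": 999},
#   ...     {"author": "alice", "score": 5},
#   ...     {"author": "bob", "score": 12},
#   ... ], limit=1)
#   [{'author': 'bob', 'score': 12}]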


def extract_comment_insights(comments: List[Dict], limit: int = 7) -> List[str]:
    """Extract key insights from top comments.

    Uses simple heuristics to identify valuable comments:
    - Has substantive text
    - Contains actionable information
    - Not just agreement/disagreement

    Args:
        comments: Top comments
        limit: Max insights to extract

    Returns:
        List of insight strings
    """
    # Low-value patterns to skip (matched against the lowercased body);
    # defined once rather than rebuilt on every loop iteration
    skip_patterns = [
        r'^(this|same|agreed|exactly|yep|nope|yes|no|thanks|thank you)\.?$',
        r'^(lol|lmao|haha)',
        r'^\[deleted\]',
        r'^\[removed\]',
    ]

    insights = []

    for comment in comments[:limit * 2]:  # Look at more comments than we need
        body = comment.get("body", "").strip()
        if not body or len(body) < 30:
            continue

        if any(re.match(p, body.lower()) for p in skip_patterns):
            continue

        # Truncate to the first sentence boundary past 50 chars, or ~150 chars
        insight = body[:150]
        if len(body) > 150:
            for i, char in enumerate(insight):
                if char in '.!?' and i > 50:
                    insight = insight[:i + 1]
                    break
            else:
                # No sentence boundary found; mark the cut
                insight = insight.rstrip() + "..."

        insights.append(insight)
        if len(insights) >= limit:
            break

    return insights
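

# Sketch of the insight heuristics on invented comments: short bodies and
# low-value openers are skipped; bodies under 150 chars pass through whole:
#
#   >>> extract_comment_insights([
#   ...     {"body": "this"},                                    # too short
#   ...     {"body": "lol that is wild, never seen it break."},  # low-value opener
#   ...     {"body": "We hit the same issue in prod. Pinning the client to v2 fixed it."},
#   ... ])
#   ['We hit the same issue in prod. Pinning the client to v2 fixed it.']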


def enrich_reddit_item(
    item: Dict[str, Any],
    mock_thread_data: Optional[List] = None,
) -> Dict[str, Any]:
    """Enrich a Reddit item with real engagement data.

    Args:
        item: Reddit item dict
        mock_thread_data: Mock thread JSON for testing

    Returns:
        Enriched item dict (the input is modified in place and returned)
    """
    url = item.get("url", "")

    # Fetch thread data; fall back to the unmodified item on failure
    thread_data = fetch_thread_data(url, mock_thread_data)
    if not thread_data:
        return item

    parsed = parse_thread_data(thread_data)
    submission = parsed.get("submission")
    comments = parsed.get("comments", [])

    # Update engagement metrics
    if submission:
        item["engagement"] = {
            "score": submission.get("score"),
            "num_comments": submission.get("num_comments"),
            "upvote_ratio": submission.get("upvote_ratio"),
        }

        # Update date from actual submission data
        created_utc = submission.get("created_utc")
        if created_utc:
            item["date"] = dates.timestamp_to_date(created_utc)

    # Attach top comments with absolute URLs
    top_comments = get_top_comments(comments)
    item["top_comments"] = []
    for c in top_comments:
        permalink = c.get("permalink", "")
        comment_url = f"https://reddit.com{permalink}" if permalink else ""
        item["top_comments"].append({
            "score": c.get("score", 0),
            "date": dates.timestamp_to_date(c.get("created_utc")),
            "author": c.get("author", ""),
            "excerpt": c.get("body", "")[:200],
            "url": comment_url,
        })

    # Extract insights from the top comments
    item["comment_insights"] = extract_comment_insights(top_comments)

    return item
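

if __name__ == "__main__":
    # Minimal smoke test with a mocked two-listing payload (all values
    # invented); the mock bypasses the http helper entirely. Run as
    # `python -m <package>.<this_module>` so the relative imports resolve.
    _mock_thread = [
        {"data": {"children": [{"kind": "t3", "data": {
            "score": 321, "num_comments": 57, "upvote_ratio": 0.96,
            "created_utc": 1700000000,
            "permalink": "/r/python/comments/abc123/t/",
            "title": "Example thread", "selftext": "Example body"}}]}},
        {"data": {"children": [{"kind": "t1", "data": {
            "score": 40, "created_utc": 1700003600, "author": "alice",
            "body": "We hit the same issue in prod. Pinning the client to v2 fixed it.",
            "permalink": "/r/python/comments/abc123/t/c1/"}}]}},
    ]
    enriched = enrich_reddit_item(
        {"url": "https://www.reddit.com/r/python/comments/abc123/t/"},
        mock_thread_data=_mock_thread,
    )
    print(enriched["engagement"], enriched["comment_insights"])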