"""Reddit thread enrichment with real engagement metrics.""" import re from typing import Any, Dict, List, Optional from urllib.parse import urlparse from . import http, dates def extract_reddit_path(url: str) -> Optional[str]: """Extract the path from a Reddit URL. Args: url: Reddit URL Returns: Path component or None """ try: parsed = urlparse(url) if "reddit.com" not in parsed.netloc: return None return parsed.path except: return None def fetch_thread_data(url: str, mock_data: Optional[Dict] = None) -> Optional[Dict[str, Any]]: """Fetch Reddit thread JSON data. Args: url: Reddit thread URL mock_data: Mock data for testing Returns: Thread data dict or None on failure """ if mock_data is not None: return mock_data path = extract_reddit_path(url) if not path: return None try: data = http.get_reddit_json(path) return data except http.HTTPError: return None def parse_thread_data(data: Any) -> Dict[str, Any]: """Parse Reddit thread JSON into structured data. Args: data: Raw Reddit JSON response Returns: Dict with submission and comments data """ result = { "submission": None, "comments": [], } if not isinstance(data, list) or len(data) < 1: return result # First element is submission listing submission_listing = data[0] if isinstance(submission_listing, dict): children = submission_listing.get("data", {}).get("children", []) if children: sub_data = children[0].get("data", {}) result["submission"] = { "score": sub_data.get("score"), "num_comments": sub_data.get("num_comments"), "upvote_ratio": sub_data.get("upvote_ratio"), "created_utc": sub_data.get("created_utc"), "permalink": sub_data.get("permalink"), "title": sub_data.get("title"), "selftext": sub_data.get("selftext", "")[:500], # Truncate } # Second element is comments listing if len(data) >= 2: comments_listing = data[1] if isinstance(comments_listing, dict): children = comments_listing.get("data", {}).get("children", []) for child in children: if child.get("kind") != "t1": # t1 = comment continue c_data = child.get("data", {}) if not c_data.get("body"): continue comment = { "score": c_data.get("score", 0), "created_utc": c_data.get("created_utc"), "author": c_data.get("author", "[deleted]"), "body": c_data.get("body", "")[:300], # Truncate "permalink": c_data.get("permalink"), } result["comments"].append(comment) return result def get_top_comments(comments: List[Dict], limit: int = 10) -> List[Dict[str, Any]]: """Get top comments sorted by score. Args: comments: List of comment dicts limit: Maximum number to return Returns: Top comments sorted by score """ # Filter out deleted/removed valid = [c for c in comments if c.get("author") not in ("[deleted]", "[removed]")] # Sort by score descending sorted_comments = sorted(valid, key=lambda c: c.get("score", 0), reverse=True) return sorted_comments[:limit] def extract_comment_insights(comments: List[Dict], limit: int = 7) -> List[str]: """Extract key insights from top comments. 
def get_top_comments(comments: List[Dict], limit: int = 10) -> List[Dict[str, Any]]:
    """Get top comments sorted by score.

    Args:
        comments: List of comment dicts
        limit: Maximum number to return

    Returns:
        Top comments sorted by score descending
    """
    # Filter out deleted/removed comments.
    valid = [c for c in comments if c.get("author") not in ("[deleted]", "[removed]")]
    # Sort by score descending.
    sorted_comments = sorted(valid, key=lambda c: c.get("score", 0), reverse=True)
    return sorted_comments[:limit]


# Patterns for low-value comments that should not become insights.
_SKIP_PATTERNS = [
    r'^(this|same|agreed|exactly|yep|nope|yes|no|thanks|thank you)\.?$',
    r'^(lol|lmao|haha)',
    r'^\[deleted\]',
    r'^\[removed\]',
]


def extract_comment_insights(comments: List[Dict], limit: int = 7) -> List[str]:
    """Extract key insights from top comments.

    Uses simple heuristics to identify valuable comments:
    - has substantive text
    - contains actionable information
    - is not just agreement/disagreement

    Args:
        comments: Top comments
        limit: Max insights to extract

    Returns:
        List of insight strings
    """
    insights = []
    for comment in comments[:limit * 2]:  # Look at more comments than we need
        body = comment.get("body", "").strip()
        if not body or len(body) < 30:
            continue

        # Skip low-value patterns.
        if any(re.match(p, body.lower()) for p in _SKIP_PATTERNS):
            continue

        # Truncate to the first sentence boundary past 50 chars, or ~150 chars.
        insight = body[:150]
        if len(body) > 150:
            for i, char in enumerate(insight):
                if char in '.!?' and i > 50:
                    insight = insight[:i + 1]
                    break
            else:
                insight = insight.rstrip() + "..."
        insights.append(insight)

        if len(insights) >= limit:
            break

    return insights


def enrich_reddit_item(
    item: Dict[str, Any],
    mock_thread_data: Optional[List[Any]] = None,
) -> Dict[str, Any]:
    """Enrich a Reddit item with real engagement data.

    Args:
        item: Reddit item dict
        mock_thread_data: Mock thread JSON for testing

    Returns:
        Enriched item dict (the input item, modified in place)
    """
    url = item.get("url", "")

    # Fetch thread data.
    thread_data = fetch_thread_data(url, mock_thread_data)
    if not thread_data:
        return item

    parsed = parse_thread_data(thread_data)
    submission = parsed.get("submission")
    comments = parsed.get("comments", [])

    # Update engagement metrics.
    if submission:
        item["engagement"] = {
            "score": submission.get("score"),
            "num_comments": submission.get("num_comments"),
            "upvote_ratio": submission.get("upvote_ratio"),
        }
        # Update the date from the actual submission data.
        created_utc = submission.get("created_utc")
        if created_utc:
            item["date"] = dates.timestamp_to_date(created_utc)

    # Attach top comments.
    top_comments = get_top_comments(comments)
    item["top_comments"] = []
    for c in top_comments:
        permalink = c.get("permalink", "")
        comment_url = f"https://reddit.com{permalink}" if permalink else ""
        item["top_comments"].append({
            "score": c.get("score", 0),
            "date": dates.timestamp_to_date(c.get("created_utc")),
            "author": c.get("author", ""),
            "excerpt": c.get("body", "")[:200],
            "url": comment_url,
        })

    # Extract insights from the top comments.
    item["comment_insights"] = extract_comment_insights(top_comments)

    return item
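

if __name__ == "__main__":
    # Minimal offline smoke test (a sketch, not part of the pipeline): the
    # mock payload below mirrors the two-listing shape sketched above, with
    # made-up values. Because this module uses relative imports, run it as
    # `python -m <package>.<module>` rather than directly.
    mock_thread = [
        {
            "data": {
                "children": [{
                    "data": {
                        "score": 412,
                        "num_comments": 2,
                        "upvote_ratio": 0.97,
                        "created_utc": 1700000000,
                        "permalink": "/r/example/comments/abc123/example_thread/",
                        "title": "Example thread",
                        "selftext": "Example body text.",
                    },
                }],
            },
        },
        {
            "data": {
                "children": [{
                    "kind": "t1",
                    "data": {
                        "score": 88,
                        "created_utc": 1700000500,
                        "author": "example_user",
                        "body": "A substantive example comment that is long enough to pass the insight filters.",
                        "permalink": "/r/example/comments/abc123/example_thread/def456/",
                    },
                }],
            },
        },
    ]
    enriched = enrich_reddit_item(
        {"url": "https://www.reddit.com/r/example/comments/abc123/example_thread/"},
        mock_thread_data=mock_thread,
    )
    print(enriched["engagement"])
    print(enriched["top_comments"])
    print(enriched["comment_insights"])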