"""Reddit thread enrichment with real engagement metrics."""

import re
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

from . import http, dates


def extract_reddit_path(url: str) -> Optional[str]:
    """Extract the path from a Reddit URL.

    Args:
        url: Reddit URL

    Returns:
        Path component, or None if the URL is not a reddit.com URL
    """
    try:
        parsed = urlparse(url)
        if "reddit.com" not in parsed.netloc:
            return None
        return parsed.path
    except ValueError:  # urlparse raises ValueError on malformed input
        return None
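

# Illustrative behavior of extract_reddit_path (doctest-style sketch with
# hypothetical URLs; kept as comments so the module stays import-clean):
#
#   >>> extract_reddit_path("https://www.reddit.com/r/python/comments/abc123/some_title/")
#   '/r/python/comments/abc123/some_title/'
#   >>> extract_reddit_path("https://example.com/post")  # non-Reddit host -> None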


def fetch_thread_data(url: str, mock_data: Optional[List] = None) -> Optional[List[Any]]:
    """Fetch Reddit thread JSON data.

    Args:
        url: Reddit thread URL
        mock_data: Mock thread JSON for testing (bypasses the network)

    Returns:
        Raw thread JSON (a two-element list of listings) or None on failure
    """
    if mock_data is not None:
        return mock_data

    path = extract_reddit_path(url)
    if not path:
        return None

    try:
        return http.get_reddit_json(path)
    except http.HTTPError:
        return None
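

# Sketch of the mock short-circuit in fetch_thread_data: any non-None
# mock_data is returned as-is, so tests never hit the network (the
# two-element list below only mimics Reddit's "<permalink>.json" shape):
#
#   >>> mock = [{"data": {"children": []}}, {"data": {"children": []}}]
#   >>> fetch_thread_data("https://www.reddit.com/r/python/comments/abc123/t/", mock) is mock
#   True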


def parse_thread_data(data: Any) -> Dict[str, Any]:
    """Parse Reddit thread JSON into structured data.

    Args:
        data: Raw Reddit JSON response

    Returns:
        Dict with "submission" (dict or None) and "comments" (list of dicts)
    """
    result: Dict[str, Any] = {
        "submission": None,
        "comments": [],
    }

    if not isinstance(data, list) or len(data) < 1:
        return result

    # First element is the submission listing
    submission_listing = data[0]
    if isinstance(submission_listing, dict):
        children = submission_listing.get("data", {}).get("children", [])
        if children:
            sub_data = children[0].get("data", {})
            result["submission"] = {
                "score": sub_data.get("score"),
                "num_comments": sub_data.get("num_comments"),
                "upvote_ratio": sub_data.get("upvote_ratio"),
                "created_utc": sub_data.get("created_utc"),
                "permalink": sub_data.get("permalink"),
                "title": sub_data.get("title"),
                # Guard against a null selftext before truncating
                "selftext": (sub_data.get("selftext") or "")[:500],
            }

    # Second element is the comments listing
    if len(data) >= 2:
        comments_listing = data[1]
        if isinstance(comments_listing, dict):
            children = comments_listing.get("data", {}).get("children", [])
            for child in children:
                if child.get("kind") != "t1":  # t1 = comment; skips "more" stubs
                    continue
                c_data = child.get("data", {})
                if not c_data.get("body"):
                    continue

                comment = {
                    "score": c_data.get("score", 0),
                    "created_utc": c_data.get("created_utc"),
                    "author": c_data.get("author", "[deleted]"),
                    "body": c_data.get("body", "")[:300],  # Truncate long comments
                    "permalink": c_data.get("permalink"),
                }
                result["comments"].append(comment)

    return result
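

# Shape of the payload parse_thread_data expects, as served by Reddit's
# public "<permalink>.json" endpoint (field values invented for illustration):
#
#   [
#       {"data": {"children": [          # listing 0: the submission (kind "t3")
#           {"kind": "t3", "data": {"score": 512, "num_comments": 98,
#                                   "upvote_ratio": 0.97, "title": "..."}}]}},
#       {"data": {"children": [          # listing 1: top-level comments (kind "t1")
#           {"kind": "t1", "data": {"score": 64, "author": "someone",
#                                   "body": "..."}}]}},
#   ]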


def get_top_comments(comments: List[Dict], limit: int = 10) -> List[Dict[str, Any]]:
    """Get top comments sorted by score.

    Args:
        comments: List of comment dicts
        limit: Maximum number to return

    Returns:
        Top comments sorted by score, highest first
    """
    # Filter out deleted/removed authors
    valid = [c for c in comments if c.get("author") not in ("[deleted]", "[removed]")]

    # Sort by score descending
    sorted_comments = sorted(valid, key=lambda c: c.get("score", 0), reverse=True)

    return sorted_comments[:limit]
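

# Sketch of get_top_comments on invented data: deleted/removed authors are
# dropped before ranking, then the rest sort by score descending:
#
#   >>> get_top_comments([
#   ...     {"author": "[deleted]", "score": 999},
#   ...     {"author": "alice", "score": 5},
#   ...     {"author": "bob", "score": 12},
#   ... ], limit=1)
#   [{'author': 'bob', 'score': 12}]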


def extract_comment_insights(comments: List[Dict], limit: int = 7) -> List[str]:
    """Extract key insights from top comments.

    Uses simple heuristics to identify valuable comments:
    - Has substantive text
    - Contains actionable information
    - Not just agreement/disagreement

    Args:
        comments: Top comments
        limit: Max insights to extract

    Returns:
        List of insight strings
    """
    # Low-value patterns to skip (matched against the lowercased body);
    # defined once rather than rebuilt on every loop iteration
    skip_patterns = [
        r'^(this|same|agreed|exactly|yep|nope|yes|no|thanks|thank you)\.?$',
        r'^(lol|lmao|haha)',
        r'^\[deleted\]',
        r'^\[removed\]',
    ]

    insights = []

    for comment in comments[:limit * 2]:  # Look at more comments than we need
        body = comment.get("body", "").strip()
        if not body or len(body) < 30:
            continue

        if any(re.match(p, body.lower()) for p in skip_patterns):
            continue

        # Truncate to the first sentence boundary past 50 chars, or ~150 chars
        insight = body[:150]
        if len(body) > 150:
            for i, char in enumerate(insight):
                if char in '.!?' and i > 50:
                    insight = insight[:i + 1]
                    break
            else:
                # No sentence boundary found; mark the cut
                insight = insight.rstrip() + "..."

        insights.append(insight)
        if len(insights) >= limit:
            break

    return insights
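

# Sketch of the insight heuristics on invented comments: short bodies and
# low-value openers are skipped; bodies under 150 chars pass through whole:
#
#   >>> extract_comment_insights([
#   ...     {"body": "this"},                                    # too short
#   ...     {"body": "lol that is wild, never seen it break."},  # low-value opener
#   ...     {"body": "We hit the same issue in prod. Pinning the client to v2 fixed it."},
#   ... ])
#   ['We hit the same issue in prod. Pinning the client to v2 fixed it.']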


def enrich_reddit_item(
    item: Dict[str, Any],
    mock_thread_data: Optional[List] = None,
) -> Dict[str, Any]:
    """Enrich a Reddit item with real engagement data.

    Args:
        item: Reddit item dict
        mock_thread_data: Mock thread JSON for testing

    Returns:
        Enriched item dict (the input is modified in place and returned)
    """
    url = item.get("url", "")

    # Fetch thread data; fall back to the unmodified item on failure
    thread_data = fetch_thread_data(url, mock_thread_data)
    if not thread_data:
        return item

    parsed = parse_thread_data(thread_data)
    submission = parsed.get("submission")
    comments = parsed.get("comments", [])

    # Update engagement metrics
    if submission:
        item["engagement"] = {
            "score": submission.get("score"),
            "num_comments": submission.get("num_comments"),
            "upvote_ratio": submission.get("upvote_ratio"),
        }

        # Update date from actual submission data
        created_utc = submission.get("created_utc")
        if created_utc:
            item["date"] = dates.timestamp_to_date(created_utc)

    # Attach top comments with absolute URLs
    top_comments = get_top_comments(comments)
    item["top_comments"] = []
    for c in top_comments:
        permalink = c.get("permalink", "")
        comment_url = f"https://reddit.com{permalink}" if permalink else ""
        item["top_comments"].append({
            "score": c.get("score", 0),
            "date": dates.timestamp_to_date(c.get("created_utc")),
            "author": c.get("author", ""),
            "excerpt": c.get("body", "")[:200],
            "url": comment_url,
        })

    # Extract insights from the top comments
    item["comment_insights"] = extract_comment_insights(top_comments)

    return item
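

if __name__ == "__main__":
    # Minimal smoke test with a mocked two-listing payload (all values
    # invented); the mock bypasses the http helper entirely. Run as
    # `python -m <package>.<this_module>` so the relative imports resolve.
    _mock_thread = [
        {"data": {"children": [{"kind": "t3", "data": {
            "score": 321, "num_comments": 57, "upvote_ratio": 0.96,
            "created_utc": 1700000000,
            "permalink": "/r/python/comments/abc123/t/",
            "title": "Example thread", "selftext": "Example body"}}]}},
        {"data": {"children": [{"kind": "t1", "data": {
            "score": 40, "created_utc": 1700003600, "author": "alice",
            "body": "We hit the same issue in prod. Pinning the client to v2 fixed it.",
            "permalink": "/r/python/comments/abc123/t/c1/"}}]}},
    ]
    enriched = enrich_reddit_item(
        {"url": "https://www.reddit.com/r/python/comments/abc123/t/"},
        mock_thread_data=_mock_thread,
    )
    print(enriched["engagement"], enriched["comment_insights"])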