"""WebSearch module for last30days skill.
|
|
|
|
NOTE: WebSearch uses Claude's built-in WebSearch tool, which runs INSIDE Claude Code.
|
|
Unlike Reddit/X which use external APIs, WebSearch results are obtained by Claude
|
|
directly and passed to this module for normalization and scoring.
|
|
|
|
The typical flow is:
|
|
1. Claude invokes WebSearch tool with the topic
|
|
2. Claude passes results to parse_websearch_results()
|
|
3. Results are normalized into WebSearchItem objects
|
|
"""
|
|
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
from urllib.parse import urlparse
|
|
|
|
from . import schema
|
|
|
|
|
|
# Lowercase month name -> month number (1-12), for date parsing.
# Both abbreviated ("jan", "sept") and full ("january") spellings are
# present; extract_date_from_snippet looks up only the first three
# letters of a lowercased match, so the full-name entries serve direct
# lookups by other callers.
MONTH_MAP = {
    "jan": 1, "january": 1,
    "feb": 2, "february": 2,
    "mar": 3, "march": 3,
    "apr": 4, "april": 4,
    "may": 5,
    "jun": 6, "june": 6,
    "jul": 7, "july": 7,
    "aug": 8, "august": 8,
    "sep": 9, "sept": 9, "september": 9,
    "oct": 10, "october": 10,
    "nov": 11, "november": 11,
    "dec": 12, "december": 12,
}
|
|
|
|
|
|
# URL-path date patterns, tried in order of how common they are in the wild.
_URL_DATE_PATTERNS = (
    re.compile(r'/(\d{4})/(\d{2})/(\d{2})/'),     # /YYYY/MM/DD/ (most common)
    re.compile(r'/(\d{4})-(\d{2})-(\d{2})[-/]'),  # /YYYY-MM-DD/ or /YYYY-MM-DD-
    re.compile(r'/(\d{4})(\d{2})(\d{2})/'),       # /YYYYMMDD/ (compact)
)


def extract_date_from_url(url: str) -> Optional[str]:
    """Try to extract a date from URL path.

    Many sites embed dates in URLs like:
    - /2026/01/24/article-title
    - /2026-01-24/article
    - /blog/20260124/title

    Args:
        url: URL to parse

    Returns:
        Date string in YYYY-MM-DD format, or None
    """
    for pattern in _URL_DATE_PATTERNS:
        match = pattern.search(url)
        if not match:
            continue
        year, month, day = match.groups()
        # Plausibility window: 4-digit runs outside 2020-2030 are far more
        # likely to be IDs or version numbers than publication dates.
        if not 2020 <= int(year) <= 2030:
            continue
        # Calendar validation: rejects impossible dates such as /2026/02/31/
        # or month 13, which the old per-component range checks let through.
        try:
            datetime(int(year), int(month), int(day))
        except ValueError:
            continue
        return f"{year}-{month}-{day}"

    return None
|
|
|
|
|
|
# Month-name alternation shared by the two absolute-date patterns below.
_MONTH_NAME_RE = (
    r'(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|'
    r'jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
)


def _format_if_valid(year: int, month: int, day: int) -> Optional[str]:
    """Return "YYYY-MM-DD" if the components form a real, plausible date.

    Returns None outside the 2020-2030 plausibility window or for
    impossible calendar dates (e.g. February 31).
    """
    if not 2020 <= year <= 2030:
        return None
    try:
        datetime(year, month, day)  # raises ValueError for e.g. Feb 31
    except ValueError:
        return None
    return f"{year}-{month:02d}-{day:02d}"


def extract_date_from_snippet(text: str) -> Optional[str]:
    """Try to extract a date from text snippet or title.

    Looks for patterns like:
    - January 24, 2026 or Jan 24, 2026
    - 24 January 2026
    - 2026-01-24
    - "3 days ago", "yesterday", "last week"

    Args:
        text: Text to parse

    Returns:
        Date string in YYYY-MM-DD format, or None
    """
    if not text:
        return None

    text_lower = text.lower()

    # Pattern 1: Month DD, YYYY (e.g., "January 24, 2026")
    match = re.search(
        r'\b' + _MONTH_NAME_RE + r'\s+(\d{1,2})(?:st|nd|rd|th)?,?\s*(\d{4})\b',
        text_lower,
    )
    if match:
        month_str, day, year = match.groups()
        # MONTH_MAP keys include the 3-letter abbreviations, so truncating
        # the (possibly full) matched name to 3 chars always resolves.
        month = MONTH_MAP.get(month_str[:3])
        if month:
            formatted = _format_if_valid(int(year), month, int(day))
            if formatted:
                return formatted

    # Pattern 2: DD Month YYYY (e.g., "24 January 2026")
    match = re.search(
        r'\b(\d{1,2})(?:st|nd|rd|th)?\s+' + _MONTH_NAME_RE + r'\s+(\d{4})\b',
        text_lower,
    )
    if match:
        day, month_str, year = match.groups()
        month = MONTH_MAP.get(month_str[:3])
        if month:
            formatted = _format_if_valid(int(year), month, int(day))
            if formatted:
                return formatted

    # Pattern 3: YYYY-MM-DD (ISO format) -- digits only, so the original
    # (non-lowercased) text is fine here.
    match = re.search(r'\b(\d{4})-(\d{2})-(\d{2})\b', text)
    if match:
        year, month, day = match.groups()
        formatted = _format_if_valid(int(year), int(month), int(day))
        if formatted:
            return formatted

    # Pattern 4: Relative dates ("3 days ago", "yesterday", etc.)
    today = datetime.now()

    # "yesterday" must be tested before "today": otherwise substring checks
    # could misfire on longer phrases.
    if "yesterday" in text_lower:
        return (today - timedelta(days=1)).strftime("%Y-%m-%d")

    if "today" in text_lower:
        return today.strftime("%Y-%m-%d")

    # "N days ago"
    match = re.search(r'\b(\d+)\s*days?\s*ago\b', text_lower)
    if match:
        days = int(match.group(1))
        if days <= 60:  # Reasonable range; larger values look like noise
            return (today - timedelta(days=days)).strftime("%Y-%m-%d")

    # "N hours ago" -> today
    match = re.search(r'\b(\d+)\s*hours?\s*ago\b', text_lower)
    if match:
        return today.strftime("%Y-%m-%d")

    # "last week" -> ~7 days ago
    if "last week" in text_lower:
        return (today - timedelta(days=7)).strftime("%Y-%m-%d")

    # "this week" -> ~3 days ago (middle of week)
    if "this week" in text_lower:
        return (today - timedelta(days=3)).strftime("%Y-%m-%d")

    return None
|
|
|
|
|
|
def extract_date_signals(
    url: str,
    snippet: str,
    title: str,
) -> Tuple[Optional[str], str]:
    """Extract a publication date from whichever signal yields one first.

    Signals are consulted in decreasing order of reliability: the URL path
    (dates embedded by the publisher's CMS), then the snippet text, then
    the page title.

    Args:
        url: Page URL
        snippet: Page snippet/description
        title: Page title

    Returns:
        Tuple of (date_string, confidence):
        - date found in URL -> 'high' confidence
        - date found in snippet or title -> 'med' confidence
        - nothing found -> (None, 'low')
    """
    attempts = (
        (extract_date_from_url, url, "high"),
        (extract_date_from_snippet, snippet, "med"),
        (extract_date_from_snippet, title, "med"),
    )
    for extractor, source, confidence in attempts:
        found = extractor(source)
        if found:
            return found, confidence

    return None, "low"
|
|
|
|
|
|
# Hosts whose web-search hits are dropped: Reddit and X/Twitter content is
# gathered through dedicated pipelines elsewhere in this skill, so results
# from these hosts would be duplicates. Common host variants (www.,
# mobile., old.) are listed explicitly.
EXCLUDED_DOMAINS = {
    "reddit.com",
    "www.reddit.com",
    "old.reddit.com",
    "twitter.com",
    "www.twitter.com",
    "x.com",
    "www.x.com",
    "mobile.twitter.com",
}
|
|
|
|
|
|
def extract_domain(url: str) -> str:
    """Return the host portion of a URL, without a leading "www.".

    Args:
        url: Full URL

    Returns:
        Domain string (e.g., "medium.com"); empty string if the URL
        cannot be parsed.
    """
    try:
        host = urlparse(url).netloc.lower()
    except Exception:
        return ""
    # Strip the "www." prefix for cleaner display.
    return host[4:] if host.startswith("www.") else host
|
|
|
|
|
|
def is_excluded_domain(url: str) -> bool:
    """Check if URL is from an excluded domain (Reddit/X).

    Matches both exact hosts listed in EXCLUDED_DOMAINS and subdomains of
    them (e.g. "np.reddit.com", "m.twitter.com"). The old exact-only check
    let mirror/regional subdomains slip through even though all Reddit/X
    content is meant to be handled by the dedicated pipelines.

    Args:
        url: URL to check

    Returns:
        True if URL should be excluded
    """
    try:
        domain = urlparse(url).netloc.lower()
    except Exception:
        return False

    if not domain:
        return False
    if domain in EXCLUDED_DOMAINS:
        return True
    # Suffix match with a leading dot so "evil-reddit.com" does NOT match
    # while "np.reddit.com" does.
    return any(domain.endswith("." + excluded) for excluded in EXCLUDED_DOMAINS)
|
|
|
|
|
|
def parse_websearch_results(
    results: List[Dict[str, Any]],
    topic: str,
    from_date: str = "",
    to_date: str = "",
) -> List[Dict[str, Any]]:
    """Parse WebSearch results into normalized format.

    This function expects results from Claude's WebSearch tool.
    Each result should have: title, url, snippet, and optionally date/relevance.

    Uses "Date Detective" approach:
    1. Extract dates from URLs (high confidence)
    2. Extract dates from snippets/titles (med confidence)
    3. Hard filter: exclude items with verified old dates
    4. Keep items with no date signals (with low confidence penalty)

    Args:
        results: List of WebSearch result dicts
        topic: Original search topic (kept for context; not used here)
        from_date: Start date for filtering (YYYY-MM-DD)
        to_date: End date for filtering (YYYY-MM-DD)

    Returns:
        List of normalized item dicts ready for WebSearchItem creation
    """
    items: List[Dict[str, Any]] = []

    for result in results:
        if not isinstance(result, dict):
            continue

        url = result.get("url", "")
        if not url:
            continue

        # Skip Reddit/X URLs (handled separately by their own pipelines)
        if is_excluded_domain(url):
            continue

        title = str(result.get("title", "")).strip()
        snippet = str(result.get("snippet", result.get("description", ""))).strip()

        if not title and not snippet:
            continue

        # Date Detective: trust a well-formed provided date, otherwise mine
        # the URL / snippet / title for signals.
        date: Optional[str] = None
        date_confidence = "low"

        provided = result.get("date")
        if provided and re.match(r'^\d{4}-\d{2}-\d{2}$', str(provided)):
            date = str(provided)  # coerce so the range filters compare str-vs-str
            date_confidence = "med"
        else:
            # BUG FIX: previously a malformed provided date (e.g. "Jan 2026")
            # was kept verbatim when extraction failed, corrupting both the
            # lexicographic range filters below and the emitted item. Now a
            # malformed date is discarded unless extraction finds a real one.
            extracted_date, confidence = extract_date_signals(url, snippet, title)
            if extracted_date:
                date = extracted_date
                date_confidence = confidence

        # Hard filter: verified old content. ISO date strings compare
        # correctly as plain strings.
        if date and from_date and date < from_date:
            continue  # DROP - verified old content

        # Hard filter: date in the future indicates a parsing error.
        if date and to_date and date > to_date:
            continue  # DROP - future date

        # Clamp provided relevance into [0, 1]; default to 0.5 when absent
        # or unparseable.
        relevance = result.get("relevance", 0.5)
        try:
            relevance = min(1.0, max(0.0, float(relevance)))
        except (TypeError, ValueError):
            relevance = 0.5

        items.append({
            # BUG FIX: ids are now gap-free ("W1", "W2", ...) even when
            # earlier results were skipped; the old code numbered by the raw
            # input index, leaving holes like W1, W3, W7.
            "id": f"W{len(items) + 1}",
            "title": title[:200],  # Truncate long titles
            "url": url,
            "source_domain": extract_domain(url),
            "snippet": snippet[:500],  # Truncate long snippets
            "date": date,
            "date_confidence": date_confidence,
            "relevance": relevance,
            "why_relevant": str(result.get("why_relevant", "")).strip(),
        })

    return items
|
|
|
|
|
|
def normalize_websearch_items(
    items: List[Dict[str, Any]],
    from_date: str,
    to_date: str,
) -> List[schema.WebSearchItem]:
    """Convert parsed dicts to WebSearchItem objects.

    Args:
        items: List of parsed item dicts
        from_date: Start of date range (YYYY-MM-DD)
        to_date: End of date range (YYYY-MM-DD)

    Returns:
        List of WebSearchItem objects
    """
    return [
        schema.WebSearchItem(
            id=entry["id"],
            title=entry["title"],
            url=entry["url"],
            source_domain=entry["source_domain"],
            snippet=entry["snippet"],
            date=entry.get("date"),
            date_confidence=entry.get("date_confidence", "low"),
            relevance=entry.get("relevance", 0.5),
            why_relevant=entry.get("why_relevant", ""),
        )
        for entry in items
    ]
|
|
|
|
|
|
def dedupe_websearch(items: List[schema.WebSearchItem]) -> List[schema.WebSearchItem]:
    """Remove duplicate WebSearch items.

    Deduplication is based on the URL, normalized by lowercasing and
    dropping any trailing slash. The first occurrence of each URL wins,
    and input order is otherwise preserved.

    Args:
        items: List of WebSearchItem objects

    Returns:
        Deduplicated list
    """
    # Insertion-ordered dict keyed on normalized URL: setdefault keeps the
    # first item seen for each key.
    unique: Dict[str, schema.WebSearchItem] = {}
    for item in items:
        key = item.url.lower().rstrip("/")
        unique.setdefault(key, item)

    return list(unique.values())
|