"""WebSearch module for last30days skill. NOTE: WebSearch uses Claude's built-in WebSearch tool, which runs INSIDE Claude Code. Unlike Reddit/X which use external APIs, WebSearch results are obtained by Claude directly and passed to this module for normalization and scoring. The typical flow is: 1. Claude invokes WebSearch tool with the topic 2. Claude passes results to parse_websearch_results() 3. Results are normalized into WebSearchItem objects """ import re from datetime import datetime, timedelta from typing import Any, Dict, List, Optional, Tuple from urllib.parse import urlparse from . import schema # Month name mappings for date parsing MONTH_MAP = { "jan": 1, "january": 1, "feb": 2, "february": 2, "mar": 3, "march": 3, "apr": 4, "april": 4, "may": 5, "jun": 6, "june": 6, "jul": 7, "july": 7, "aug": 8, "august": 8, "sep": 9, "sept": 9, "september": 9, "oct": 10, "october": 10, "nov": 11, "november": 11, "dec": 12, "december": 12, } def extract_date_from_url(url: str) -> Optional[str]: """Try to extract a date from URL path. Many sites embed dates in URLs like: - /2026/01/24/article-title - /2026-01-24/article - /blog/20260124/title Args: url: URL to parse Returns: Date string in YYYY-MM-DD format, or None """ # Pattern 1: /YYYY/MM/DD/ (most common) match = re.search(r'/(\d{4})/(\d{2})/(\d{2})/', url) if match: year, month, day = match.groups() if 2020 <= int(year) <= 2030 and 1 <= int(month) <= 12 and 1 <= int(day) <= 31: return f"{year}-{month}-{day}" # Pattern 2: /YYYY-MM-DD/ or /YYYY-MM-DD- match = re.search(r'/(\d{4})-(\d{2})-(\d{2})[-/]', url) if match: year, month, day = match.groups() if 2020 <= int(year) <= 2030 and 1 <= int(month) <= 12 and 1 <= int(day) <= 31: return f"{year}-{month}-{day}" # Pattern 3: /YYYYMMDD/ (compact) match = re.search(r'/(\d{4})(\d{2})(\d{2})/', url) if match: year, month, day = match.groups() if 2020 <= int(year) <= 2030 and 1 <= int(month) <= 12 and 1 <= int(day) <= 31: return f"{year}-{month}-{day}" return None def extract_date_from_snippet(text: str) -> Optional[str]: """Try to extract a date from text snippet or title. 
def extract_date_from_snippet(text: str) -> Optional[str]:
    """Try to extract a date from text snippet or title.

    Looks for patterns like:
    - January 24, 2026 or Jan 24, 2026
    - 24 January 2026
    - 2026-01-24
    - "3 days ago", "yesterday", "last week"

    Args:
        text: Text to parse

    Returns:
        Date string in YYYY-MM-DD format, or None
    """
    if not text:
        return None

    text_lower = text.lower()

    # Pattern 1: Month DD, YYYY (e.g., "January 24, 2026")
    match = re.search(
        r'\b(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|'
        r'jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
        r'\s+(\d{1,2})(?:st|nd|rd|th)?,?\s*(\d{4})\b',
        text_lower
    )
    if match:
        month_str, day, year = match.groups()
        month = MONTH_MAP.get(month_str[:3])
        if month and 2020 <= int(year) <= 2030 and 1 <= int(day) <= 31:
            return f"{year}-{month:02d}-{int(day):02d}"

    # Pattern 2: DD Month YYYY (e.g., "24 January 2026")
    match = re.search(
        r'\b(\d{1,2})(?:st|nd|rd|th)?\s+'
        r'(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|'
        r'jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)'
        r'\s+(\d{4})\b',
        text_lower
    )
    if match:
        day, month_str, year = match.groups()
        month = MONTH_MAP.get(month_str[:3])
        if month and 2020 <= int(year) <= 2030 and 1 <= int(day) <= 31:
            return f"{year}-{month:02d}-{int(day):02d}"

    # Pattern 3: YYYY-MM-DD (ISO format)
    match = re.search(r'\b(\d{4})-(\d{2})-(\d{2})\b', text)
    if match:
        year, month, day = match.groups()
        if 2020 <= int(year) <= 2030 and 1 <= int(month) <= 12 and 1 <= int(day) <= 31:
            return f"{year}-{month}-{day}"

    # Pattern 4: Relative dates ("3 days ago", "yesterday", etc.)
    today = datetime.now()

    if "yesterday" in text_lower:
        date = today - timedelta(days=1)
        return date.strftime("%Y-%m-%d")

    if "today" in text_lower:
        return today.strftime("%Y-%m-%d")

    # "N days ago"
    match = re.search(r'\b(\d+)\s*days?\s*ago\b', text_lower)
    if match:
        days = int(match.group(1))
        if days <= 60:  # Reasonable range
            date = today - timedelta(days=days)
            return date.strftime("%Y-%m-%d")

    # "N hours ago" -> today
    match = re.search(r'\b(\d+)\s*hours?\s*ago\b', text_lower)
    if match:
        return today.strftime("%Y-%m-%d")

    # "last week" -> ~7 days ago
    if "last week" in text_lower:
        date = today - timedelta(days=7)
        return date.strftime("%Y-%m-%d")

    # "this week" -> ~3 days ago (middle of week)
    if "this week" in text_lower:
        date = today - timedelta(days=3)
        return date.strftime("%Y-%m-%d")

    return None


def extract_date_signals(
    url: str,
    snippet: str,
    title: str,
) -> Tuple[Optional[str], str]:
    """Extract date from any available signal.

    Tries URL first (most reliable), then snippet, then title.

    Args:
        url: Page URL
        snippet: Page snippet/description
        title: Page title

    Returns:
        Tuple of (date_string, confidence)
        - date from URL: 'high' confidence
        - date from snippet/title: 'med' confidence
        - no date found: None, 'low' confidence
    """
    # Try URL first (most reliable)
    url_date = extract_date_from_url(url)
    if url_date:
        return url_date, "high"

    # Try snippet
    snippet_date = extract_date_from_snippet(snippet)
    if snippet_date:
        return snippet_date, "med"

    # Try title
    title_date = extract_date_from_snippet(title)
    if title_date:
        return title_date, "med"

    return None, "low"


# Domains to exclude (Reddit and X are handled separately)
EXCLUDED_DOMAINS = {
    "reddit.com", "www.reddit.com", "old.reddit.com",
    "twitter.com", "www.twitter.com",
    "x.com", "www.x.com",
    "mobile.twitter.com",
}
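
# Illustrative examples (hypothetical URLs and snippets) showing how the three
# confidence tiers fall out of extract_date_signals:
#
#   extract_date_signals("https://example.com/2026/01/24/post", "", "")
#       -> ("2026-01-24", "high")   # date found in the URL path
#   extract_date_signals("https://example.com/post",
#                        "Published January 24, 2026", "")
#       -> ("2026-01-24", "med")    # date found in the snippet
#   extract_date_signals("https://example.com/post", "", "")
#       -> (None, "low")            # no date signal anywhere
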
def extract_domain(url: str) -> str:
    """Extract the domain from a URL.

    Args:
        url: Full URL

    Returns:
        Domain string (e.g., "medium.com")
    """
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        # Remove www. prefix for cleaner display
        if domain.startswith("www."):
            domain = domain[4:]
        return domain
    except Exception:
        return ""


def is_excluded_domain(url: str) -> bool:
    """Check if URL is from an excluded domain (Reddit/X).

    Args:
        url: URL to check

    Returns:
        True if URL should be excluded
    """
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        return domain in EXCLUDED_DOMAINS
    except Exception:
        return False


def parse_websearch_results(
    results: List[Dict[str, Any]],
    topic: str,
    from_date: str = "",
    to_date: str = "",
) -> List[Dict[str, Any]]:
    """Parse WebSearch results into normalized format.

    This function expects results from Claude's WebSearch tool. Each result
    should have: title, url, snippet, and optionally date/relevance.

    Uses the "Date Detective" approach:
    1. Extract dates from URLs (high confidence)
    2. Extract dates from snippets/titles (med confidence)
    3. Hard filter: exclude items with verified old dates
    4. Keep items with no date signals (with low confidence penalty)

    Args:
        results: List of WebSearch result dicts
        topic: Original search topic (for context)
        from_date: Start date for filtering (YYYY-MM-DD)
        to_date: End date for filtering (YYYY-MM-DD)

    Returns:
        List of normalized item dicts ready for WebSearchItem creation
    """
    items = []

    for i, result in enumerate(results):
        if not isinstance(result, dict):
            continue

        url = result.get("url", "")
        if not url:
            continue

        # Skip Reddit/X URLs (handled separately)
        if is_excluded_domain(url):
            continue

        title = str(result.get("title", "")).strip()
        snippet = str(result.get("snippet", result.get("description", ""))).strip()
        if not title and not snippet:
            continue

        # Use Date Detective to extract date signals
        date = result.get("date")  # Use provided date if available
        date_confidence = "low"

        if date and re.match(r'^\d{4}-\d{2}-\d{2}$', str(date)):
            # Provided date is valid
            date_confidence = "med"
        else:
            # Discard a missing or malformed provided date, then try to
            # extract one from the URL/snippet/title
            date = None
            extracted_date, confidence = extract_date_signals(url, snippet, title)
            if extracted_date:
                date = extracted_date
                date_confidence = confidence

        # Hard filter: if we found a date and it's too old, skip
        if date and from_date and date < from_date:
            continue  # DROP - verified old content

        # Hard filter: if date is in the future, skip (parsing error)
        if date and to_date and date > to_date:
            continue  # DROP - future date

        # Get relevance if provided, default to 0.5
        relevance = result.get("relevance", 0.5)
        try:
            relevance = min(1.0, max(0.0, float(relevance)))
        except (TypeError, ValueError):
            relevance = 0.5

        item = {
            "id": f"W{i+1}",
            "title": title[:200],  # Truncate long titles
            "url": url,
            "source_domain": extract_domain(url),
            "snippet": snippet[:500],  # Truncate long snippets
            "date": date,
            "date_confidence": date_confidence,
            "relevance": relevance,
            "why_relevant": str(result.get("why_relevant", "")).strip(),
        }
        items.append(item)

    return items
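
# Illustrative filtering behavior (hypothetical inputs), assuming
# from_date="2026-01-01" and to_date="2026-01-30":
#
#   {"url": "https://example.com/2025/06/01/old-post", "title": "Old post"}
#       -> dropped: the URL-derived date 2025-06-01 is before from_date
#   {"url": "https://example.com/undated-post", "title": "Undated post"}
#       -> kept, with date=None and date_confidence="low" (the low-confidence
#          penalty mentioned in the docstring is applied downstream)
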
def normalize_websearch_items(
    items: List[Dict[str, Any]],
    from_date: str,
    to_date: str,
) -> List[schema.WebSearchItem]:
    """Convert parsed dicts to WebSearchItem objects.

    Args:
        items: List of parsed item dicts
        from_date: Start of date range (YYYY-MM-DD)
        to_date: End of date range (YYYY-MM-DD)

    Returns:
        List of WebSearchItem objects
    """
    result = []
    for item in items:
        web_item = schema.WebSearchItem(
            id=item["id"],
            title=item["title"],
            url=item["url"],
            source_domain=item["source_domain"],
            snippet=item["snippet"],
            date=item.get("date"),
            date_confidence=item.get("date_confidence", "low"),
            relevance=item.get("relevance", 0.5),
            why_relevant=item.get("why_relevant", ""),
        )
        result.append(web_item)
    return result


def dedupe_websearch(items: List[schema.WebSearchItem]) -> List[schema.WebSearchItem]:
    """Remove duplicate WebSearch items.

    Deduplication is based on URL.

    Args:
        items: List of WebSearchItem objects

    Returns:
        Deduplicated list
    """
    seen_urls = set()
    result = []
    for item in items:
        # Normalize URL for comparison
        url_key = item.url.lower().rstrip("/")
        if url_key not in seen_urls:
            seen_urls.add(url_key)
            result.append(item)
    return result
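

# Illustrative end-to-end usage (a sketch, not part of the skill's pipeline):
# Claude would normally supply `results` from its WebSearch tool; the dicts
# below are hypothetical stand-ins. Because this module uses a relative import
# of `schema`, run it as a module within its package (python -m ...). The
# final print assumes WebSearchItem exposes the constructor fields as
# attributes (e.g., a dataclass or similar).
if __name__ == "__main__":
    sample_results = [
        {
            "title": "Example roundup post",
            "url": "https://example.com/2026/01/24/roundup",
            "snippet": "Published January 24, 2026 - a roundup of recent developments.",
            "relevance": 0.8,
        },
        {
            # Duplicate URL (trailing slash) to show dedupe_websearch in action
            "title": "Example roundup post (duplicate)",
            "url": "https://example.com/2026/01/24/roundup/",
            "snippet": "Same article, different URL form.",
        },
        {
            # Reddit result: dropped by the excluded-domain check
            "title": "Discussion thread",
            "url": "https://www.reddit.com/r/example/comments/abc123/",
            "snippet": "Reddit discussion.",
        },
    ]

    parsed = parse_websearch_results(
        sample_results,
        topic="example topic",
        from_date="2026-01-01",
        to_date="2026-01-30",
    )
    normalized = normalize_websearch_items(parsed, "2026-01-01", "2026-01-30")
    deduped = dedupe_websearch(normalized)
    for web_item in deduped:
        print(web_item.id, web_item.date, web_item.date_confidence, web_item.url)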