""" Change detection for documentation pages. """ import hashlib import difflib from datetime import datetime import requests from .models import PageChange, ChangeType, ChangeReport class ChangeDetector: """ Detects changes in documentation pages. Uses multiple strategies: 1. Content hashing (SHA-256) 2. Last-Modified headers 3. ETag headers 4. Content diffing Examples: detector = ChangeDetector() # Check single page change = detector.check_page( url="https://react.dev/learn", old_hash="abc123" ) # Generate diff diff = detector.generate_diff(old_content, new_content) # Check multiple pages changes = detector.check_pages(urls, previous_state) """ def __init__(self, timeout: int = 30): """ Initialize change detector. Args: timeout: Request timeout in seconds """ self.timeout = timeout def compute_hash(self, content: str) -> str: """ Compute SHA-256 hash of content. Args: content: Page content Returns: Hexadecimal hash string """ return hashlib.sha256(content.encode("utf-8")).hexdigest() def fetch_page(self, url: str) -> tuple[str, dict[str, str]]: """ Fetch page content and metadata. Args: url: Page URL Returns: Tuple of (content, metadata) metadata includes: last-modified, etag, content-type Raises: requests.RequestException: If fetch fails """ response = requests.get( url, timeout=self.timeout, headers={"User-Agent": "SkillSeekers-Sync/1.0"} ) response.raise_for_status() metadata = { "last-modified": response.headers.get("Last-Modified"), "etag": response.headers.get("ETag"), "content-type": response.headers.get("Content-Type"), "content-length": response.headers.get("Content-Length"), } return response.text, metadata def check_page( self, url: str, old_hash: str | None = None, generate_diff: bool = False, old_content: str | None = None, ) -> PageChange: """ Check if page has changed. Args: url: Page URL old_hash: Previous content hash generate_diff: Whether to generate diff old_content: Previous content (for diff generation) Returns: PageChange object Raises: requests.RequestException: If fetch fails """ try: content, metadata = self.fetch_page(url) new_hash = self.compute_hash(content) # Determine change type if old_hash is None: change_type = ChangeType.ADDED elif old_hash == new_hash: change_type = ChangeType.UNCHANGED else: change_type = ChangeType.MODIFIED # Generate diff if requested diff = None if generate_diff and old_content and change_type == ChangeType.MODIFIED: diff = self.generate_diff(old_content, content) return PageChange( url=url, change_type=change_type, old_hash=old_hash, new_hash=new_hash, diff=diff, detected_at=datetime.utcnow(), ) except requests.RequestException: # Page might be deleted or temporarily unavailable return PageChange( url=url, change_type=ChangeType.DELETED, old_hash=old_hash, new_hash=None, detected_at=datetime.utcnow(), ) def check_pages( self, urls: list[str], previous_hashes: dict[str, str], generate_diffs: bool = False ) -> ChangeReport: """ Check multiple pages for changes. 

        Args:
            urls: List of URLs to check
            previous_hashes: URL -> hash mapping from previous state
            generate_diffs: Whether to generate diffs

        Returns:
            ChangeReport with all detected changes
        """
        added = []
        modified = []
        deleted = []
        unchanged_count = 0

        # Check each URL
        checked_urls = set()
        for url in urls:
            checked_urls.add(url)
            old_hash = previous_hashes.get(url)

            change = self.check_page(url, old_hash, generate_diff=generate_diffs)

            if change.change_type == ChangeType.ADDED:
                added.append(change)
            elif change.change_type == ChangeType.MODIFIED:
                modified.append(change)
            elif change.change_type == ChangeType.UNCHANGED:
                unchanged_count += 1
            elif change.change_type == ChangeType.DELETED:
                # Fetch failed: report the page as deleted/unavailable
                deleted.append(change)

        # Check for deleted pages (in previous state but not in current)
        for url, old_hash in previous_hashes.items():
            if url not in checked_urls:
                deleted.append(
                    PageChange(
                        url=url,
                        change_type=ChangeType.DELETED,
                        old_hash=old_hash,
                        new_hash=None,
                        detected_at=datetime.utcnow(),
                    )
                )

        return ChangeReport(
            skill_name="unknown",  # To be set by caller
            total_pages=len(urls),
            added=added,
            modified=modified,
            deleted=deleted,
            unchanged=unchanged_count,
            checked_at=datetime.utcnow(),
        )

    def generate_diff(self, old_content: str, new_content: str) -> str:
        """
        Generate unified diff between old and new content.

        Args:
            old_content: Original content
            new_content: New content

        Returns:
            Unified diff string
        """
        old_lines = old_content.splitlines(keepends=True)
        new_lines = new_content.splitlines(keepends=True)

        # Lines keep their newlines, so the default lineterm keeps the
        # header and hunk lines properly terminated when joined.
        diff = difflib.unified_diff(old_lines, new_lines, fromfile="old", tofile="new")

        return "".join(diff)

    def generate_summary_diff(self, old_content: str, new_content: str) -> str:
        """
        Generate human-readable diff summary.

        Args:
            old_content: Original content
            new_content: New content

        Returns:
            Summary string with added/removed line counts
        """
        old_lines = old_content.splitlines()
        new_lines = new_content.splitlines()

        diff = difflib.unified_diff(old_lines, new_lines)
        diff_lines = list(diff)

        added = sum(
            1 for line in diff_lines if line.startswith("+") and not line.startswith("+++")
        )
        removed = sum(
            1 for line in diff_lines if line.startswith("-") and not line.startswith("---")
        )

        return f"+{added} -{removed} lines"

    def check_header_changes(
        self,
        url: str,
        old_modified: str | None = None,
        old_etag: str | None = None,
    ) -> bool:
        """
        Quick check using HTTP headers (no content download).

        Args:
            url: Page URL
            old_modified: Previous Last-Modified header
            old_etag: Previous ETag header

        Returns:
            True if headers indicate change, False otherwise
        """
        try:
            # Use HEAD request for efficiency
            response = requests.head(
                url,
                timeout=self.timeout,
                headers={"User-Agent": "SkillSeekers-Sync/1.0"},
            )
            response.raise_for_status()

            new_modified = response.headers.get("Last-Modified")
            new_etag = response.headers.get("ETag")

            # Check if headers indicate change
            if old_modified and new_modified and old_modified != new_modified:
                return True

            return bool(old_etag and new_etag and old_etag != new_etag)

        except requests.RequestException:
            # If HEAD request fails, assume change (will be verified with GET)
            return True

    def batch_check_headers(
        self,
        urls: list[str],
        previous_metadata: dict[str, dict[str, str]],
    ) -> list[str]:
        """
        Batch check URLs using headers only.

        Args:
            urls: URLs to check
            previous_metadata: URL -> metadata mapping

        Returns:
            List of URLs that likely changed
        """
        changed_urls = []

        for url in urls:
            old_meta = previous_metadata.get(url, {})
            old_modified = old_meta.get("last-modified")
            old_etag = old_meta.get("etag")

            if self.check_header_changes(url, old_modified, old_etag):
                changed_urls.append(url)

        return changed_urls
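

# --- Usage sketch (illustrative only) ---
# A minimal end-to-end example of how ChangeDetector is meant to be driven,
# assuming ChangeReport exposes the fields it is constructed with above
# (added, modified, deleted, unchanged). The URLs and the stored hash are
# hypothetical placeholders; a real caller would load previous_hashes from
# its persisted sync state. Note: check_pages performs live HTTP requests.
if __name__ == "__main__":
    detector = ChangeDetector(timeout=10)

    urls = ["https://react.dev/learn", "https://react.dev/reference/react"]
    previous_hashes = {"https://react.dev/learn": "0" * 64}  # placeholder hash

    report = detector.check_pages(urls, previous_hashes, generate_diffs=False)
    print(
        f"added={len(report.added)} modified={len(report.modified)} "
        f"deleted={len(report.deleted)} unchanged={report.unchanged}"
    )

    # The diff helpers operate on plain strings and need no network access.
    print(detector.generate_summary_diff("a\nb\nc", "a\nB\nc"))  # -> "+1 -1 lines"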