Files
skill-seekers-reference/src/skill_seekers/sync/detector.py
yusyus 0265de5816 style: Format all Python files with ruff
- Formatted 103 files to comply with ruff format requirements
- No code logic changes, only formatting/whitespace
- Fixes CI formatting check failures
2026-02-08 14:42:27 +03:00

303 lines
8.8 KiB
Python

"""
Change detection for documentation pages.
"""
import difflib
import hashlib
from datetime import datetime, timezone

import requests

from .models import ChangeReport, ChangeType, PageChange
class ChangeDetector:
    """
    Detects changes in documentation pages.

    Uses multiple strategies:
    1. Content hashing (SHA-256)
    2. Last-Modified headers
    3. ETag headers
    4. Content diffing

    Examples:
        detector = ChangeDetector()

        # Check single page
        change = detector.check_page(
            url="https://react.dev/learn",
            old_hash="abc123"
        )

        # Generate diff
        diff = detector.generate_diff(old_content, new_content)

        # Check multiple pages
        changes = detector.check_pages(urls, previous_state)
    """

    # Sent with every request so GET and HEAD identify the same client.
    USER_AGENT = "SkillSeekers-Sync/1.0"

    def __init__(self, timeout: int = 30):
        """
        Initialize change detector.

        Args:
            timeout: Request timeout in seconds
        """
        self.timeout = timeout

    @staticmethod
    def _utc_now() -> datetime:
        """
        Return the current UTC time as a naive datetime.

        Produces the same value as the deprecated ``datetime.utcnow()``
        (UTC wall clock, ``tzinfo`` of None) without the deprecation warning.
        """
        return datetime.now(timezone.utc).replace(tzinfo=None)

    def compute_hash(self, content: str) -> str:
        """
        Compute SHA-256 hash of content.

        Args:
            content: Page content

        Returns:
            Hexadecimal hash string of the UTF-8 encoded content
        """
        return hashlib.sha256(content.encode("utf-8")).hexdigest()

    def fetch_page(self, url: str) -> tuple[str, dict[str, str | None]]:
        """
        Fetch page content and metadata.

        Args:
            url: Page URL

        Returns:
            Tuple of (content, metadata).
            metadata has the keys last-modified, etag, content-type and
            content-length; a value is None when the response did not
            include that header.

        Raises:
            requests.RequestException: If the request fails or the response
                status is an error (via ``raise_for_status``)
        """
        response = requests.get(
            url, timeout=self.timeout, headers={"User-Agent": self.USER_AGENT}
        )
        response.raise_for_status()
        metadata = {
            "last-modified": response.headers.get("Last-Modified"),
            "etag": response.headers.get("ETag"),
            "content-type": response.headers.get("Content-Type"),
            "content-length": response.headers.get("Content-Length"),
        }
        return response.text, metadata

    def check_page(
        self,
        url: str,
        old_hash: str | None = None,
        generate_diff: bool = False,
        old_content: str | None = None,
    ) -> PageChange:
        """
        Check if page has changed.

        Fetch failures are not raised: any ``requests.RequestException`` is
        reported as a ``ChangeType.DELETED`` result, since the page may be
        gone or only temporarily unavailable.

        Args:
            url: Page URL
            old_hash: Previous content hash (None means the page is new)
            generate_diff: Whether to generate diff
            old_content: Previous content (for diff generation)

        Returns:
            PageChange object
        """
        # Only the fetch can raise RequestException, so only it is guarded.
        try:
            content, _metadata = self.fetch_page(url)
        except requests.RequestException:
            # Page might be deleted or temporarily unavailable
            return PageChange(
                url=url,
                change_type=ChangeType.DELETED,
                old_hash=old_hash,
                new_hash=None,
                detected_at=self._utc_now(),
            )

        new_hash = self.compute_hash(content)

        # Determine change type
        if old_hash is None:
            change_type = ChangeType.ADDED
        elif old_hash == new_hash:
            change_type = ChangeType.UNCHANGED
        else:
            change_type = ChangeType.MODIFIED

        # Generate diff if requested. Compare against None so that a page
        # whose previous content was empty still gets a diff.
        diff = None
        if generate_diff and old_content is not None and change_type == ChangeType.MODIFIED:
            diff = self.generate_diff(old_content, content)

        return PageChange(
            url=url,
            change_type=change_type,
            old_hash=old_hash,
            new_hash=new_hash,
            diff=diff,
            detected_at=self._utc_now(),
        )

    def check_pages(
        self,
        urls: list[str],
        previous_hashes: dict[str, str],
        generate_diffs: bool = False,
        previous_contents: dict[str, str] | None = None,
    ) -> ChangeReport:
        """
        Check multiple pages for changes.

        A page is reported as deleted when it is in ``previous_hashes`` and
        either is missing from ``urls`` or can no longer be fetched.

        Args:
            urls: List of URLs to check
            previous_hashes: URL -> hash mapping from previous state
            generate_diffs: Whether to generate diffs
            previous_contents: Optional URL -> previous content mapping.
                Diffs can only be generated for URLs present here.

        Returns:
            ChangeReport with all detected changes
        """
        old_contents = previous_contents or {}
        added = []
        modified = []
        deleted = []
        unchanged_count = 0

        # Check each URL
        checked_urls = set()
        for url in urls:
            checked_urls.add(url)
            change = self.check_page(
                url,
                previous_hashes.get(url),
                generate_diff=generate_diffs,
                old_content=old_contents.get(url),
            )

            if change.change_type == ChangeType.ADDED:
                added.append(change)
            elif change.change_type == ChangeType.MODIFIED:
                modified.append(change)
            elif change.change_type == ChangeType.UNCHANGED:
                unchanged_count += 1
            elif change.change_type == ChangeType.DELETED and url in previous_hashes:
                # A known page that can no longer be fetched. URLs that were
                # never seen before and fail to fetch are still skipped:
                # there is no previous state for them to be deleted from.
                deleted.append(change)

        # Check for deleted pages (in previous state but not in current)
        deleted.extend(
            PageChange(
                url=url,
                change_type=ChangeType.DELETED,
                old_hash=old_hash,
                new_hash=None,
                detected_at=self._utc_now(),
            )
            for url, old_hash in previous_hashes.items()
            if url not in checked_urls
        )

        return ChangeReport(
            skill_name="unknown",  # To be set by caller
            total_pages=len(urls),
            added=added,
            modified=modified,
            deleted=deleted,
            unchanged=unchanged_count,
            checked_at=self._utc_now(),
        )

    def generate_diff(self, old_content: str, new_content: str) -> str:
        """
        Generate unified diff between old and new content.

        Lines are compared without their terminators and the diff lines are
        joined with "\\n", so header, hunk and content lines never run
        together. Differences that consist only of line terminators are
        therefore not shown.

        Args:
            old_content: Original content
            new_content: New content

        Returns:
            Unified diff string (empty when the line content is identical)
        """
        diff = difflib.unified_diff(
            old_content.splitlines(),
            new_content.splitlines(),
            fromfile="old",
            tofile="new",
            lineterm="",  # inputs carry no newlines, so control lines must not either
        )
        return "\n".join(diff)

    def generate_summary_diff(self, old_content: str, new_content: str) -> str:
        """
        Generate human-readable diff summary.

        Counts come from the line-level opcodes rather than from prefix
        matching on rendered diff text, so content lines that themselves
        start with "++" or "--" are counted correctly.

        Args:
            old_content: Original content
            new_content: New content

        Returns:
            Summary string with added/removed line counts
        """
        matcher = difflib.SequenceMatcher(
            None, old_content.splitlines(), new_content.splitlines()
        )
        added = 0
        removed = 0
        for tag, old_start, old_end, new_start, new_end in matcher.get_opcodes():
            if tag != "equal":
                # insert spans no old lines and delete spans no new lines,
                # so both sums are correct for every non-equal opcode.
                removed += old_end - old_start
                added += new_end - new_start
        return f"+{added} -{removed} lines"

    def check_header_changes(
        self, url: str, old_modified: str | None = None, old_etag: str | None = None
    ) -> bool:
        """
        Quick check using HTTP headers (no content download).

        A header only counts as changed when both the old and the new value
        are present and differ. If neither Last-Modified nor ETag can be
        compared, the result is False.

        Args:
            url: Page URL
            old_modified: Previous Last-Modified header
            old_etag: Previous ETag header

        Returns:
            True if headers indicate change or the request fails,
            False otherwise
        """
        try:
            # Use HEAD request for efficiency. Redirects are followed
            # explicitly so the headers describe the same final resource
            # that fetch_page's GET would return.
            response = requests.head(
                url,
                timeout=self.timeout,
                headers={"User-Agent": self.USER_AGENT},
                allow_redirects=True,
            )
            response.raise_for_status()
        except requests.RequestException:
            # If HEAD request fails, assume change (will be verified with GET)
            return True

        new_modified = response.headers.get("Last-Modified")
        new_etag = response.headers.get("ETag")

        # Check if headers indicate change
        if old_modified and new_modified and old_modified != new_modified:
            return True
        return bool(old_etag and new_etag and old_etag != new_etag)

    def batch_check_headers(
        self, urls: list[str], previous_metadata: dict[str, dict[str, str]]
    ) -> list[str]:
        """
        Batch check URLs using headers only.

        Args:
            urls: URLs to check
            previous_metadata: URL -> metadata mapping; the "last-modified"
                and "etag" keys of each entry are used

        Returns:
            List of URLs that likely changed, in input order
        """
        changed_urls = []
        for url in urls:
            old_meta = previous_metadata.get(url, {})
            if self.check_header_changes(
                url, old_meta.get("last-modified"), old_meta.get("etag")
            ):
                changed_urls.append(url)
        return changed_urls