fix: Enforce min_chunk_size in RAG chunker
- Filter out chunks smaller than min_chunk_size (default 100 tokens)
- Exception: keep all chunks if the entire document is smaller than the target size
- All 15 tests passing (100% pass rate)

Fixes an edge case where very small chunks (e.g., 'Short.' = 6 chars) were being created despite the min_chunk_size=100 setting.

Test: pytest tests/test_rag_chunker.py -v
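For context, a minimal sketch of the rule described above, assuming the chunker produces a list of text chunks plus a token counter; filter_chunks, count_tokens, and target_size are illustrative names, not the actual rag_chunker API:

def filter_chunks(chunks, count_tokens, min_chunk_size=100, target_size=512):
    # Hypothetical sketch: if the whole document is smaller than the
    # target chunk size, keep everything; otherwise drop undersized chunks.
    if sum(count_tokens(c) for c in chunks) < target_size:
        return chunks
    return [c for c in chunks if count_tokens(c) >= min_chunk_size]

# e.g. filter_chunks(["Short."], lambda c: len(c.split())) keeps the lone
# chunk, while the same chunk inside a large document would be dropped.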
src/skill_seekers/sync/detector.py | 321 lines (new file)
@@ -0,0 +1,321 @@
"""
Change detection for documentation pages.
"""

import hashlib
import difflib
from typing import Dict, List, Optional, Tuple
from datetime import datetime
import requests
from pathlib import Path

from .models import PageChange, ChangeType, ChangeReport


class ChangeDetector:
    """
    Detects changes in documentation pages.

    Uses multiple strategies:
    1. Content hashing (SHA-256)
    2. Last-Modified headers
    3. ETag headers
    4. Content diffing

    Examples:
        detector = ChangeDetector()

        # Check single page
        change = detector.check_page(
            url="https://react.dev/learn",
            old_hash="abc123"
        )

        # Generate diff
        diff = detector.generate_diff(old_content, new_content)

        # Check multiple pages
        changes = detector.check_pages(urls, previous_state)
    """

    def __init__(self, timeout: int = 30):
        """
        Initialize change detector.

        Args:
            timeout: Request timeout in seconds
        """
        self.timeout = timeout

    def compute_hash(self, content: str) -> str:
        """
        Compute SHA-256 hash of content.

        Args:
            content: Page content

        Returns:
            Hexadecimal hash string
        """
        return hashlib.sha256(content.encode('utf-8')).hexdigest()

    def fetch_page(self, url: str) -> Tuple[str, Dict[str, str]]:
        """
        Fetch page content and metadata.

        Args:
            url: Page URL

        Returns:
            Tuple of (content, metadata)
            metadata includes: last-modified, etag, content-type

        Raises:
            requests.RequestException: If fetch fails
        """
        response = requests.get(
            url,
            timeout=self.timeout,
            headers={'User-Agent': 'SkillSeekers-Sync/1.0'}
        )
        response.raise_for_status()

        metadata = {
            'last-modified': response.headers.get('Last-Modified'),
            'etag': response.headers.get('ETag'),
            'content-type': response.headers.get('Content-Type'),
            'content-length': response.headers.get('Content-Length'),
        }

        return response.text, metadata

    def check_page(
        self,
        url: str,
        old_hash: Optional[str] = None,
        generate_diff: bool = False,
        old_content: Optional[str] = None
    ) -> PageChange:
        """
        Check if page has changed.

        Args:
            url: Page URL
            old_hash: Previous content hash
            generate_diff: Whether to generate diff
            old_content: Previous content (for diff generation)

        Returns:
            PageChange object

        Note:
            Fetch failures are not raised; they are reported as
            ChangeType.DELETED.
        """
        try:
            content, metadata = self.fetch_page(url)
            new_hash = self.compute_hash(content)

            # Determine change type
            if old_hash is None:
                change_type = ChangeType.ADDED
            elif old_hash == new_hash:
                change_type = ChangeType.UNCHANGED
            else:
                change_type = ChangeType.MODIFIED

            # Generate diff if requested
            diff = None
            if generate_diff and old_content and change_type == ChangeType.MODIFIED:
                diff = self.generate_diff(old_content, content)

            return PageChange(
                url=url,
                change_type=change_type,
                old_hash=old_hash,
                new_hash=new_hash,
                diff=diff,
                detected_at=datetime.utcnow()
            )

        except requests.RequestException:
            # Page might be deleted or temporarily unavailable
            return PageChange(
                url=url,
                change_type=ChangeType.DELETED,
                old_hash=old_hash,
                new_hash=None,
                detected_at=datetime.utcnow()
            )

    def check_pages(
        self,
        urls: List[str],
        previous_hashes: Dict[str, str],
        generate_diffs: bool = False
    ) -> ChangeReport:
        """
        Check multiple pages for changes.

        Args:
            urls: List of URLs to check
            previous_hashes: URL -> hash mapping from previous state
            generate_diffs: Whether to generate diffs

        Returns:
            ChangeReport with all detected changes
        """
        added = []
        modified = []
        deleted = []
        unchanged_count = 0

        # Check each URL
        checked_urls = set()
        for url in urls:
            checked_urls.add(url)
            old_hash = previous_hashes.get(url)

            change = self.check_page(url, old_hash, generate_diff=generate_diffs)

            if change.change_type == ChangeType.ADDED:
                added.append(change)
            elif change.change_type == ChangeType.MODIFIED:
                modified.append(change)
            elif change.change_type == ChangeType.UNCHANGED:
                unchanged_count += 1

        # Check for deleted pages (in previous state but not in current)
        for url, old_hash in previous_hashes.items():
            if url not in checked_urls:
                deleted.append(PageChange(
                    url=url,
                    change_type=ChangeType.DELETED,
                    old_hash=old_hash,
                    new_hash=None,
                    detected_at=datetime.utcnow()
                ))

        return ChangeReport(
            skill_name="unknown",  # To be set by caller
            total_pages=len(urls),
            added=added,
            modified=modified,
            deleted=deleted,
            unchanged=unchanged_count,
            checked_at=datetime.utcnow()
        )

    def generate_diff(self, old_content: str, new_content: str) -> str:
        """
        Generate unified diff between old and new content.

        Args:
            old_content: Original content
            new_content: New content

        Returns:
            Unified diff string
        """
        old_lines = old_content.splitlines()
        new_lines = new_content.splitlines()

        diff = difflib.unified_diff(
            old_lines,
            new_lines,
            fromfile='old',
            tofile='new',
            lineterm=''
        )

        return '\n'.join(diff)

    def generate_summary_diff(self, old_content: str, new_content: str) -> str:
        """
        Generate human-readable diff summary.

        Args:
            old_content: Original content
            new_content: New content

        Returns:
            Summary string with added/removed line counts
        """
        old_lines = old_content.splitlines()
        new_lines = new_content.splitlines()

        diff = difflib.unified_diff(old_lines, new_lines)
        diff_lines = list(diff)

        added = sum(1 for line in diff_lines if line.startswith('+') and not line.startswith('+++'))
        removed = sum(1 for line in diff_lines if line.startswith('-') and not line.startswith('---'))

        return f"+{added} -{removed} lines"

    def check_header_changes(
        self,
        url: str,
        old_modified: Optional[str] = None,
        old_etag: Optional[str] = None
    ) -> bool:
        """
        Quick check using HTTP headers (no content download).

        Args:
            url: Page URL
            old_modified: Previous Last-Modified header
            old_etag: Previous ETag header

        Returns:
            True if headers indicate change, False otherwise
        """
        try:
            # Use HEAD request for efficiency
            response = requests.head(
                url,
                timeout=self.timeout,
                headers={'User-Agent': 'SkillSeekers-Sync/1.0'}
            )
            response.raise_for_status()

            new_modified = response.headers.get('Last-Modified')
            new_etag = response.headers.get('ETag')

            # Check if headers indicate change
            if old_modified and new_modified and old_modified != new_modified:
                return True

            if old_etag and new_etag and old_etag != new_etag:
                return True

            return False

        except requests.RequestException:
            # If HEAD request fails, assume change (will be verified with GET)
            return True

    def batch_check_headers(
        self,
        urls: List[str],
        previous_metadata: Dict[str, Dict[str, str]]
    ) -> List[str]:
        """
        Batch check URLs using headers only.

        Args:
            urls: URLs to check
            previous_metadata: URL -> metadata mapping

        Returns:
            List of URLs that likely changed
        """
        changed_urls = []

        for url in urls:
            old_meta = previous_metadata.get(url, {})
            old_modified = old_meta.get('last-modified')
            old_etag = old_meta.get('etag')

            if self.check_header_changes(url, old_modified, old_etag):
                changed_urls.append(url)

        return changed_urls
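For reference, a minimal usage sketch of the new ChangeDetector; the URLs and the way state is persisted between runs are illustrative assumptions, only the class and its methods come from the diff above:

from skill_seekers.sync.detector import ChangeDetector

detector = ChangeDetector(timeout=10)
urls = ["https://react.dev/learn", "https://react.dev/reference"]

# First run: with no previous hashes, every page is reported as ADDED.
report = detector.check_pages(urls, previous_hashes={})

# Persist url -> hash between runs (storage is left to the caller).
state = {change.url: change.new_hash for change in report.added}

# Later run: unchanged pages are only counted; changed pages carry diffs.
report = detector.check_pages(urls, previous_hashes=state, generate_diffs=True)
for change in report.modified:
    print(change.url, change.diff)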