fix: Enforce min_chunk_size in RAG chunker

- Filter out chunks smaller than min_chunk_size (default 100 tokens)
- Exception: Keep all chunks if entire document is smaller than target size
- All 15 tests passing (100% pass rate)

Fixes edge case where very small chunks (e.g., 'Short.' = 6 chars) were
being created despite min_chunk_size=100 setting.

Test: pytest tests/test_rag_chunker.py -v
This commit is contained in:
yusyus
2026-02-07 20:59:03 +03:00
parent 3a769a27cd
commit 8b3f31409e
65 changed files with 16133 additions and 7 deletions

View File

@@ -0,0 +1,321 @@
"""
Change detection for documentation pages.
"""
import hashlib
import difflib
from typing import Dict, List, Optional, Tuple
from datetime import datetime
import requests
from pathlib import Path
from .models import PageChange, ChangeType, ChangeReport
class ChangeDetector:
    """
    Detects changes in documentation pages.

    Uses multiple strategies, cheapest first:
      1. Content hashing (SHA-256) - definitive, requires a full GET
      2. Last-Modified headers - cheap HEAD-request pre-check
      3. ETag headers - cheap HEAD-request pre-check
      4. Content diffing - explains *what* changed, not just *that* it changed

    Examples:
        detector = ChangeDetector()

        # Check single page
        change = detector.check_page(
            url="https://react.dev/learn",
            old_hash="abc123"
        )

        # Generate diff
        diff = detector.generate_diff(old_content, new_content)

        # Check multiple pages
        changes = detector.check_pages(urls, previous_state)
    """
def __init__(self, timeout: int = 30):
    """
    Initialize change detector.

    Args:
        timeout: Per-request timeout in seconds, applied to every
            GET/HEAD issued by this detector (default 30).
    """
    self.timeout = timeout
def compute_hash(self, content: str) -> str:
    """
    Return the SHA-256 digest of *content* as a hex string.

    Args:
        content: Page content (text; encoded as UTF-8 before hashing)

    Returns:
        64-character lowercase hexadecimal digest
    """
    digest = hashlib.sha256()
    digest.update(content.encode('utf-8'))
    return digest.hexdigest()
def fetch_page(self, url: str) -> Tuple[str, Dict[str, str]]:
    """
    Download a page and collect caching-related response metadata.

    Args:
        url: Page URL

    Returns:
        Tuple of (content, metadata). metadata maps the lowercase
        header names 'last-modified', 'etag', 'content-type' and
        'content-length' to their response values (None when the
        server omitted a header).

    Raises:
        requests.RequestException: If the request fails or the server
            responds with an error status.
    """
    response = requests.get(
        url,
        timeout=self.timeout,
        headers={'User-Agent': 'SkillSeekers-Sync/1.0'}
    )
    response.raise_for_status()

    # requests header lookup is case-insensitive; keys stay lowercase
    # so callers can feed this dict straight into batch_check_headers.
    wanted = ('Last-Modified', 'ETag', 'Content-Type', 'Content-Length')
    metadata = {name.lower(): response.headers.get(name) for name in wanted}
    return response.text, metadata
def check_page(
    self,
    url: str,
    old_hash: Optional[str] = None,
    generate_diff: bool = False,
    old_content: Optional[str] = None
) -> PageChange:
    """
    Check whether a single page has changed since *old_hash* was taken.

    Args:
        url: Page URL
        old_hash: Previous content hash (None means the page is new)
        generate_diff: Whether to attach a unified diff to the result
        old_content: Previous content, required for diff generation

    Returns:
        PageChange describing the outcome. A fetch failure (page
        deleted or temporarily unreachable) yields a DELETED change
        rather than raising.
    """
    # Only the fetch can raise requests.RequestException; keep the
    # try-block tight so unrelated errors propagate normally.
    try:
        content, _ = self.fetch_page(url)
    except requests.RequestException:
        # Page might be deleted or temporarily unavailable.
        return PageChange(
            url=url,
            change_type=ChangeType.DELETED,
            old_hash=old_hash,
            new_hash=None,
            detected_at=datetime.utcnow()
        )

    new_hash = self.compute_hash(content)

    # Classify: no prior hash -> new page; same hash -> untouched.
    if old_hash is None:
        change_type = ChangeType.ADDED
    elif old_hash == new_hash:
        change_type = ChangeType.UNCHANGED
    else:
        change_type = ChangeType.MODIFIED

    diff = None
    if generate_diff and old_content and change_type == ChangeType.MODIFIED:
        diff = self.generate_diff(old_content, content)

    return PageChange(
        url=url,
        change_type=change_type,
        old_hash=old_hash,
        new_hash=new_hash,
        diff=diff,
        detected_at=datetime.utcnow()
    )
def check_pages(
    self,
    urls: List[str],
    previous_hashes: Dict[str, str],
    generate_diffs: bool = False
) -> ChangeReport:
    """
    Check multiple pages for changes.

    Args:
        urls: List of URLs to check
        previous_hashes: URL -> hash mapping from previous state
        generate_diffs: Whether to generate diffs for modified pages

    Returns:
        ChangeReport with all detected changes. Pages whose fetch
        fails are reported as deleted, as are pages present in
        previous_hashes but absent from urls. skill_name is left as
        "unknown" for the caller to fill in.
    """
    added = []
    modified = []
    deleted = []
    unchanged_count = 0

    # Check each URL
    checked_urls = set()
    for url in urls:
        checked_urls.add(url)
        old_hash = previous_hashes.get(url)
        change = self.check_page(url, old_hash, generate_diff=generate_diffs)

        if change.change_type == ChangeType.ADDED:
            added.append(change)
        elif change.change_type == ChangeType.MODIFIED:
            modified.append(change)
        elif change.change_type == ChangeType.DELETED:
            # BUG FIX: check_page reports DELETED when the fetch fails;
            # previously these results were silently dropped, so the
            # report's categories did not account for every checked URL.
            deleted.append(change)
        elif change.change_type == ChangeType.UNCHANGED:
            unchanged_count += 1

    # Pages known from the previous state but no longer in the crawl list.
    for url, old_hash in previous_hashes.items():
        if url not in checked_urls:
            deleted.append(PageChange(
                url=url,
                change_type=ChangeType.DELETED,
                old_hash=old_hash,
                new_hash=None,
                detected_at=datetime.utcnow()
            ))

    return ChangeReport(
        skill_name="unknown",  # To be set by caller
        total_pages=len(urls),
        added=added,
        modified=modified,
        deleted=deleted,
        unchanged=unchanged_count,
        checked_at=datetime.utcnow()
    )
def generate_diff(self, old_content: str, new_content: str) -> str:
    """
    Generate a unified diff between old and new content.

    Args:
        old_content: Original content
        new_content: New content

    Returns:
        Unified diff string with newline-separated lines (labels
        'old'/'new'); empty string when the contents are identical.
    """
    # BUG FIX: the previous version split with keepends=True while also
    # passing lineterm='', so the '--- old', '+++ new' and '@@' header
    # lines carried no newline and were fused onto the following line
    # by ''.join, producing a malformed diff. Split without keepends
    # and join the terminator-free lines with '\n' instead.
    old_lines = old_content.splitlines()
    new_lines = new_content.splitlines()

    diff = difflib.unified_diff(
        old_lines,
        new_lines,
        fromfile='old',
        tofile='new',
        lineterm=''
    )
    return '\n'.join(diff)
def generate_summary_diff(self, old_content: str, new_content: str) -> str:
    """
    Produce a human-readable one-line diff summary.

    Args:
        old_content: Original content
        new_content: New content

    Returns:
        String of the form '+<added> -<removed> lines', where the
        counts exclude the '+++'/'---' file-header lines.
    """
    added = 0
    removed = 0
    for line in difflib.unified_diff(old_content.splitlines(),
                                     new_content.splitlines()):
        if line.startswith('+') and not line.startswith('+++'):
            added += 1
        elif line.startswith('-') and not line.startswith('---'):
            removed += 1
    return f"+{added} -{removed} lines"
def check_header_changes(
    self,
    url: str,
    old_modified: Optional[str] = None,
    old_etag: Optional[str] = None
) -> bool:
    """
    Quick change check using HTTP headers only (HEAD request, no body).

    Args:
        url: Page URL
        old_modified: Previous Last-Modified header value
        old_etag: Previous ETag header value

    Returns:
        True if either header differs from its previous value (or the
        HEAD request fails, erring on the side of "changed"); False
        when neither comparable header has moved.
    """
    try:
        # HEAD avoids downloading the body entirely.
        response = requests.head(
            url,
            timeout=self.timeout,
            headers={'User-Agent': 'SkillSeekers-Sync/1.0'}
        )
        response.raise_for_status()
    except requests.RequestException:
        # HEAD failed; assume a change and let a later GET verify.
        return True

    current_modified = response.headers.get('Last-Modified')
    current_etag = response.headers.get('ETag')

    # A header only signals change when both old and new values exist
    # and disagree; missing headers prove nothing either way.
    modified_moved = bool(
        old_modified and current_modified and old_modified != current_modified
    )
    etag_moved = bool(old_etag and current_etag and old_etag != current_etag)
    return modified_moved or etag_moved
def batch_check_headers(
    self,
    urls: List[str],
    previous_metadata: Dict[str, Dict[str, str]]
) -> List[str]:
    """
    Filter a URL list down to those whose headers suggest a change.

    Args:
        urls: URLs to check
        previous_metadata: URL -> metadata mapping (as produced by
            fetch_page: lowercase 'last-modified'/'etag' keys)

    Returns:
        URLs that likely changed, in the original order.
    """
    def _likely_changed(candidate: str) -> bool:
        # URLs with no recorded metadata compare against None values,
        # which check_header_changes treats as inconclusive.
        meta = previous_metadata.get(candidate, {})
        return self.check_header_changes(
            candidate,
            meta.get('last-modified'),
            meta.get('etag')
        )

    return [url for url in urls if _likely_changed(url)]