fix: Enforce min_chunk_size in RAG chunker

- Filter out chunks smaller than min_chunk_size (default 100 tokens)
- Exception: Keep all chunks if entire document is smaller than target size
- All 15 tests passing (100% pass rate)

Fixes edge case where very small chunks (e.g., 'Short.' = 6 chars) were
being created despite min_chunk_size=100 setting.

Test: pytest tests/test_rag_chunker.py -v
This commit is contained in:
yusyus
2026-02-07 20:59:03 +03:00
parent 3a769a27cd
commit 8b3f31409e
65 changed files with 16133 additions and 7 deletions

View File

@@ -0,0 +1,321 @@
"""
Change detection for documentation pages.
"""
import hashlib
import difflib
from typing import Dict, List, Optional, Tuple
from datetime import datetime
import requests
from pathlib import Path
from .models import PageChange, ChangeType, ChangeReport
class ChangeDetector:
    """
    Detects changes in documentation pages.

    Uses multiple strategies, cheapest first:
      1. Content hashing (SHA-256) - definitive, requires a full GET
      2. Last-Modified headers - cheap HEAD-request pre-check
      3. ETag headers - cheap HEAD-request pre-check
      4. Content diffing - explains *what* changed, not just *that* it changed

    Examples:
        detector = ChangeDetector()

        # Check single page
        change = detector.check_page(
            url="https://react.dev/learn",
            old_hash="abc123"
        )

        # Generate diff
        diff = detector.generate_diff(old_content, new_content)

        # Check multiple pages
        changes = detector.check_pages(urls, previous_state)
    """
def __init__(self, timeout: int = 30):
    """
    Initialize change detector.

    Args:
        timeout: Per-request timeout in seconds, applied to every
            GET/HEAD issued by this detector (default 30).
    """
    self.timeout = timeout
def compute_hash(self, content: str) -> str:
    """
    Return the SHA-256 digest of *content* as a hex string.

    Args:
        content: Page content (text; encoded as UTF-8 before hashing)

    Returns:
        64-character lowercase hexadecimal digest
    """
    digest = hashlib.sha256()
    digest.update(content.encode('utf-8'))
    return digest.hexdigest()
def fetch_page(self, url: str) -> Tuple[str, Dict[str, str]]:
    """
    Download a page and collect caching-related response metadata.

    Args:
        url: Page URL

    Returns:
        Tuple of (content, metadata). metadata maps the lowercase
        header names 'last-modified', 'etag', 'content-type' and
        'content-length' to their response values (None when the
        server omitted a header).

    Raises:
        requests.RequestException: If the request fails or the server
            responds with an error status.
    """
    response = requests.get(
        url,
        timeout=self.timeout,
        headers={'User-Agent': 'SkillSeekers-Sync/1.0'}
    )
    response.raise_for_status()

    # requests header lookup is case-insensitive; keys stay lowercase
    # so callers can feed this dict straight into batch_check_headers.
    wanted = ('Last-Modified', 'ETag', 'Content-Type', 'Content-Length')
    metadata = {name.lower(): response.headers.get(name) for name in wanted}
    return response.text, metadata
def check_page(
    self,
    url: str,
    old_hash: Optional[str] = None,
    generate_diff: bool = False,
    old_content: Optional[str] = None
) -> PageChange:
    """
    Check whether a single page has changed since *old_hash* was taken.

    Args:
        url: Page URL
        old_hash: Previous content hash (None means the page is new)
        generate_diff: Whether to attach a unified diff to the result
        old_content: Previous content, required for diff generation

    Returns:
        PageChange describing the outcome. A fetch failure (page
        deleted or temporarily unreachable) yields a DELETED change
        rather than raising.
    """
    # Only the fetch can raise requests.RequestException; keep the
    # try-block tight so unrelated errors propagate normally.
    try:
        content, _ = self.fetch_page(url)
    except requests.RequestException:
        # Page might be deleted or temporarily unavailable.
        return PageChange(
            url=url,
            change_type=ChangeType.DELETED,
            old_hash=old_hash,
            new_hash=None,
            detected_at=datetime.utcnow()
        )

    new_hash = self.compute_hash(content)

    # Classify: no prior hash -> new page; same hash -> untouched.
    if old_hash is None:
        change_type = ChangeType.ADDED
    elif old_hash == new_hash:
        change_type = ChangeType.UNCHANGED
    else:
        change_type = ChangeType.MODIFIED

    diff = None
    if generate_diff and old_content and change_type == ChangeType.MODIFIED:
        diff = self.generate_diff(old_content, content)

    return PageChange(
        url=url,
        change_type=change_type,
        old_hash=old_hash,
        new_hash=new_hash,
        diff=diff,
        detected_at=datetime.utcnow()
    )
def check_pages(
    self,
    urls: List[str],
    previous_hashes: Dict[str, str],
    generate_diffs: bool = False
) -> ChangeReport:
    """
    Check multiple pages for changes.

    Args:
        urls: List of URLs to check
        previous_hashes: URL -> hash mapping from previous state
        generate_diffs: Whether to generate diffs for modified pages

    Returns:
        ChangeReport with all detected changes. Pages whose fetch
        fails are reported as deleted, as are pages present in
        previous_hashes but absent from urls. skill_name is left as
        "unknown" for the caller to fill in.
    """
    added = []
    modified = []
    deleted = []
    unchanged_count = 0

    # Check each URL
    checked_urls = set()
    for url in urls:
        checked_urls.add(url)
        old_hash = previous_hashes.get(url)
        change = self.check_page(url, old_hash, generate_diff=generate_diffs)

        if change.change_type == ChangeType.ADDED:
            added.append(change)
        elif change.change_type == ChangeType.MODIFIED:
            modified.append(change)
        elif change.change_type == ChangeType.DELETED:
            # BUG FIX: check_page reports DELETED when the fetch fails;
            # previously these results were silently dropped, so the
            # report's categories did not account for every checked URL.
            deleted.append(change)
        elif change.change_type == ChangeType.UNCHANGED:
            unchanged_count += 1

    # Pages known from the previous state but no longer in the crawl list.
    for url, old_hash in previous_hashes.items():
        if url not in checked_urls:
            deleted.append(PageChange(
                url=url,
                change_type=ChangeType.DELETED,
                old_hash=old_hash,
                new_hash=None,
                detected_at=datetime.utcnow()
            ))

    return ChangeReport(
        skill_name="unknown",  # To be set by caller
        total_pages=len(urls),
        added=added,
        modified=modified,
        deleted=deleted,
        unchanged=unchanged_count,
        checked_at=datetime.utcnow()
    )
def generate_diff(self, old_content: str, new_content: str) -> str:
    """
    Generate a unified diff between old and new content.

    Args:
        old_content: Original content
        new_content: New content

    Returns:
        Unified diff string with newline-separated lines (labels
        'old'/'new'); empty string when the contents are identical.
    """
    # BUG FIX: the previous version split with keepends=True while also
    # passing lineterm='', so the '--- old', '+++ new' and '@@' header
    # lines carried no newline and were fused onto the following line
    # by ''.join, producing a malformed diff. Split without keepends
    # and join the terminator-free lines with '\n' instead.
    old_lines = old_content.splitlines()
    new_lines = new_content.splitlines()

    diff = difflib.unified_diff(
        old_lines,
        new_lines,
        fromfile='old',
        tofile='new',
        lineterm=''
    )
    return '\n'.join(diff)
def generate_summary_diff(self, old_content: str, new_content: str) -> str:
    """
    Produce a human-readable one-line diff summary.

    Args:
        old_content: Original content
        new_content: New content

    Returns:
        String of the form '+<added> -<removed> lines', where the
        counts exclude the '+++'/'---' file-header lines.
    """
    added = 0
    removed = 0
    for line in difflib.unified_diff(old_content.splitlines(),
                                     new_content.splitlines()):
        if line.startswith('+') and not line.startswith('+++'):
            added += 1
        elif line.startswith('-') and not line.startswith('---'):
            removed += 1
    return f"+{added} -{removed} lines"
def check_header_changes(
    self,
    url: str,
    old_modified: Optional[str] = None,
    old_etag: Optional[str] = None
) -> bool:
    """
    Quick change check using HTTP headers only (HEAD request, no body).

    Args:
        url: Page URL
        old_modified: Previous Last-Modified header value
        old_etag: Previous ETag header value

    Returns:
        True if either header differs from its previous value (or the
        HEAD request fails, erring on the side of "changed"); False
        when neither comparable header has moved.
    """
    try:
        # HEAD avoids downloading the body entirely.
        response = requests.head(
            url,
            timeout=self.timeout,
            headers={'User-Agent': 'SkillSeekers-Sync/1.0'}
        )
        response.raise_for_status()
    except requests.RequestException:
        # HEAD failed; assume a change and let a later GET verify.
        return True

    current_modified = response.headers.get('Last-Modified')
    current_etag = response.headers.get('ETag')

    # A header only signals change when both old and new values exist
    # and disagree; missing headers prove nothing either way.
    modified_moved = bool(
        old_modified and current_modified and old_modified != current_modified
    )
    etag_moved = bool(old_etag and current_etag and old_etag != current_etag)
    return modified_moved or etag_moved
def batch_check_headers(
    self,
    urls: List[str],
    previous_metadata: Dict[str, Dict[str, str]]
) -> List[str]:
    """
    Filter a URL list down to those whose headers suggest a change.

    Args:
        urls: URLs to check
        previous_metadata: URL -> metadata mapping (as produced by
            fetch_page: lowercase 'last-modified'/'etag' keys)

    Returns:
        URLs that likely changed, in the original order.
    """
    def _likely_changed(candidate: str) -> bool:
        # URLs with no recorded metadata compare against None values,
        # which check_header_changes treats as inconclusive.
        meta = previous_metadata.get(candidate, {})
        return self.check_header_changes(
            candidate,
            meta.get('last-modified'),
            meta.get('etag')
        )

    return [url for url in urls if _likely_changed(url)]