fix: Enforce min_chunk_size in RAG chunker

- Filter out chunks smaller than min_chunk_size (default 100 tokens) - Exception: Keep all chunks if entire document is smaller than target size - All 15 tests passing (100% pass rate) Fixes edge case where very small chunks (e.g., 'Short.' = 6 chars) were being created despite min_chunk_size=100 setting. Test: pytest tests/test_rag_chunker.py -v
2026-02-07 20:59:03 +03:00
parent 3a769a27cd
commit 8b3f31409e
65 changed files with 16133 additions and 7 deletions
--- a/.github/workflows/scheduled-updates.yml
+++ b/.github/workflows/scheduled-updates.yml
@@ -0,0 +1,203 @@
+# Automated Skill Updates - Runs weekly to refresh documentation
+# Security note: scheduled runs use only hardcoded constants, and the workflow_dispatch input
+# reaches scripts through the FRAMEWORKS_INPUT env variable, not by interpolation (safe pattern).
+
+name: Scheduled Skill Updates
+
+on:
+  # Manual runs may pass `frameworks` to limit which matrix entries are refreshed
+  workflow_dispatch:
+    inputs:
+      frameworks:
+        type: string
+        required: false
+        default: 'all'
+        description: 'Frameworks to update (comma-separated or "all")'
+  schedule:
+    - cron: '0 3 * * 0'  # every Sunday at 03:00 UTC
+
+jobs:
+  update-skills:
+    name: Update ${{ matrix.framework }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false  # a failing framework does not cancel the other matrix legs
+      matrix:
+        # One job per framework; the name selects configs/<name>.json and output/<name>
+        framework:
+          - react
+          - django
+          - fastapi
+          - godot
+          - vue
+          - flask
+
+    env:
+      FRAMEWORK: ${{ matrix.framework }}  # read by the shell and Python steps below
+      FRAMEWORKS_INPUT: ${{ github.event.inputs.frameworks }}  # the first check step defaults it to "all"
+
+    steps:
+    - uses: actions/checkout@v4  # Node 20 based; v3 targeted the deprecated Node 16 runtime
+      with:
+        submodules: recursive
+
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v5  # Node 20 based; v4 targeted the deprecated Node 16 runtime
+      with:
+        python-version: '3.12'  # quoted so the version is never parsed as a float
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -e .  # editable install; presumably provides the skill-seekers CLI used below
+
+    - name: Check if framework should be updated
+      id: should_update
+      run: |
+        # No input (e.g. a scheduled run) means every framework is refreshed.
+        FRAMEWORKS_INPUT="${FRAMEWORKS_INPUT:-all}"
+        # Match whole comma-separated entries so "react" is not selected by "preact".
+        requested="$(printf '%s' "$FRAMEWORKS_INPUT" | tr -d '[:blank:]' | tr ',' '\n')"
+        if [ "$FRAMEWORKS_INPUT" = "all" ] || printf '%s\n' "$requested" | grep -Fxq -- "$FRAMEWORK"; then
+          echo "update=true" >> "$GITHUB_OUTPUT"
+        else
+          echo "update=false" >> "$GITHUB_OUTPUT"
+          echo "⏭️  Skipping $FRAMEWORK (not in update list)"
+        fi
+
+    - name: Check for existing skill
+      id: check_existing  # publishes `exists` (true/false) for the steps that follow
+      if: steps.should_update.outputs.update == 'true'
+      run: |
+        skill_path="output/$FRAMEWORK"
+        if [ ! -d "$skill_path" ]; then
+          echo "exists=false" >> $GITHUB_OUTPUT
+          echo "🆕 No existing skill found"
+        else
+          echo "exists=true" >> $GITHUB_OUTPUT
+          echo "📦 Found existing skill at $skill_path"
+        fi
+
+    - name: Incremental update (if exists)
+      if: steps.should_update.outputs.update == 'true' && steps.check_existing.outputs.exists == 'true'
+      run: |
+        echo "⚡ Performing incremental update for $FRAMEWORK..."
+
+        # Keep the heredoc indented like the rest of this block: a column-0 line
+        # would end the YAML scalar, and YAML strips this indent before bash runs.
+        # Detect changes using incremental updater
+        python3 << 'EOF'
+        import os
+        import sys
+        from pathlib import Path
+
+        sys.path.insert(0, 'src')  # resolve skill_seekers from the checkout's src/ directory
+        from skill_seekers.cli.incremental_updater import IncrementalUpdater
+
+        framework = os.environ['FRAMEWORK']
+        skill_dir = Path(f'output/{framework}')
+
+        updater = IncrementalUpdater(skill_dir)
+        changes = updater.detect_changes()
+
+        if changes.has_changes:
+            print("🔄 Changes detected:")
+            print(f"   Added: {len(changes.added)}")
+            print(f"   Modified: {len(changes.modified)}")
+            print(f"   Deleted: {len(changes.deleted)}")
+
+            # Save current versions for next run
+            updater.current_versions = updater._scan_documents()
+            updater.save_current_versions()
+        else:
+            print("✓ No changes detected, skill is up to date")
+        EOF
+
+    - name: Full scrape (if new or manual)
+      # Gated on check_existing reporting that no output/<framework> directory exists
+      if: steps.should_update.outputs.update == 'true' && steps.check_existing.outputs.exists == 'false'
+      run: |
+        echo "📥 Performing full scrape for $FRAMEWORK..."
+
+        scrape_config="configs/${FRAMEWORK}.json"
+
+        if [ -f "$scrape_config" ]; then
+          # Streaming ingestion is used because the docs can be large
+          skill-seekers scrape --config "$scrape_config" --streaming --max-pages 200
+        else
+          echo "⚠️  Config not found: $scrape_config"
+        fi
+
+    - name: Generate quality report
+      if: steps.should_update.outputs.update == 'true'
+      run: |
+        SKILL_DIR="output/$FRAMEWORK"
+
+        if [ ! -d "$SKILL_DIR" ]; then
+          echo "⚠️  Skill directory not found"
+          exit 0  # nothing to analyze; treated as a skip rather than a failure
+        fi
+
+        echo "📊 Generating quality metrics..."
+        # Heredoc stays indented with this block; a column-0 line would end the YAML scalar.
+        python3 << 'EOF'
+        import sys
+        import os
+        from pathlib import Path
+        sys.path.insert(0, 'src')  # resolve skill_seekers from the checkout's src/ directory
+
+        from skill_seekers.cli.quality_metrics import QualityAnalyzer
+
+        framework = os.environ['FRAMEWORK']
+        skill_dir = Path(f'output/{framework}')
+
+        analyzer = QualityAnalyzer(skill_dir)
+        report = analyzer.generate_report()
+
+        print(f"\n📊 Quality Score: {report.overall_score.grade} ({report.overall_score.total_score:.1f}/100)")
+        print(f"   Completeness: {report.overall_score.completeness:.1f}%")
+        print(f"   Accuracy: {report.overall_score.accuracy:.1f}%")
+        print(f"   Coverage: {report.overall_score.coverage:.1f}%")
+        print(f"   Health: {report.overall_score.health:.1f}%")
+        EOF
+
+    - name: Package for Claude
+      if: steps.should_update.outputs.update == 'true'
+      run: |
+        skill_path="output/$FRAMEWORK"
+        # Without a skill directory there is nothing to package; end the step quietly
+        [ -d "$skill_path" ] || exit 0
+
+        echo "📦 Packaging $FRAMEWORK for Claude AI..."
+        skill-seekers package "$skill_path" --target claude
+
+    - name: Upload updated skill
+      if: steps.should_update.outputs.update == 'true'
+      uses: actions/upload-artifact@v4  # v3 was shut off on GitHub.com in January 2025
+      with:
+        name: ${{ env.FRAMEWORK }}-skill-updated  # unique per matrix leg, as v4 requires
+        path: output/${{ env.FRAMEWORK }}.zip
+        retention-days: 90
+
+  summary:
+    name: Update Summary
+    needs: update-skills
+    runs-on: ubuntu-latest
+    if: always()  # publish the summary even when a matrix leg failed
+
+    steps:
+    - name: Create summary
+      env:
+        UPDATE_RESULT: ${{ needs.update-skills.result }}
+      run: |
+        # Per-framework outcomes are not visible here: list the matrix and report the aggregate result.
+        echo "## 🔄 Scheduled Skills Update" >> "$GITHUB_STEP_SUMMARY"
+        echo "" >> "$GITHUB_STEP_SUMMARY"
+        echo "**Date:** $(date -u '+%Y-%m-%d %H:%M UTC')" >> "$GITHUB_STEP_SUMMARY"
+        echo "" >> "$GITHUB_STEP_SUMMARY"
+        echo "**Matrix job result:** $UPDATE_RESULT" >> "$GITHUB_STEP_SUMMARY"
+        echo "" >> "$GITHUB_STEP_SUMMARY"
+        echo "### Tracked Frameworks" >> "$GITHUB_STEP_SUMMARY"
+        printf -- '- %s\n' React Django FastAPI Godot Vue Flask >> "$GITHUB_STEP_SUMMARY"
+        echo "" >> "$GITHUB_STEP_SUMMARY"
+        echo "Skills that were updated are available in workflow artifacts." >> "$GITHUB_STEP_SUMMARY"