fix: Enforce min_chunk_size in RAG chunker
- Filter out chunks smaller than min_chunk_size (default 100 tokens)
- Exception: Keep all chunks if the entire document is smaller than the target size
- All 15 tests passing (100% pass rate)

Fixes an edge case where very small chunks (e.g., 'Short.' = 6 chars) were being created despite the min_chunk_size=100 setting.

Test: pytest tests/test_rag_chunker.py -v
This commit is contained in:
203
.github/workflows/scheduled-updates.yml
vendored
Normal file
203
.github/workflows/scheduled-updates.yml
vendored
Normal file
@@ -0,0 +1,203 @@
|
||||
# Automated Skill Updates - runs weekly to refresh documentation.
#
# Security note: the schedule trigger uses only hardcoded constants, and the
# workflow_dispatch input is read through the FRAMEWORKS_INPUT environment
# variable rather than being interpolated into a script (safe pattern).

name: Scheduled Skill Updates

on:
  schedule:
    # Run every Sunday at 3 AM UTC
    - cron: '0 3 * * 0'
  workflow_dispatch:
    inputs:
      frameworks:
        description: 'Frameworks to update (comma-separated or "all")'
        required: false
        default: 'all'
        type: string

# Least privilege: the workflow checks out the repository and uploads
# artifacts; it never pushes or calls the API with GITHUB_TOKEN, so read
# access to repository contents is sufficient.
permissions:
  contents: read
|
||||
|
||||
jobs:
  update-skills:
    name: Update ${{ matrix.framework }}
    runs-on: ubuntu-latest

    # One job per framework; fail-fast is off so a failing framework does not
    # cancel the jobs of the others.
    strategy:
      fail-fast: false
      matrix:
        # Popular frameworks to keep updated
        framework: [react, django, fastapi, godot, vue, flask]

    # Expression values are passed to the steps as environment variables so
    # the scripts read them as data ($FRAMEWORK, $FRAMEWORKS_INPUT).
    env:
      FRAMEWORK: ${{ matrix.framework }}
      FRAMEWORKS_INPUT: ${{ github.event.inputs.frameworks }}

    steps:
|
||||
# Check out the repository together with its git submodules.
# v4 runs on a supported Node runtime (v3 used the retired Node 16).
- uses: actions/checkout@v4
  with:
    submodules: recursive

- name: Set up Python 3.12
  # v5 for the same reason: v4 of setup-python targets Node 16.
  uses: actions/setup-python@v5
  with:
    # Quoted so YAML keeps the version as a string, not a float.
    python-version: '3.12'

- name: Install dependencies
  run: |
    python -m pip install --upgrade pip
    # Editable install of the project from the repository root.
    pip install -e .
|
||||
|
||||
- name: Check if framework should be updated
  id: should_update
  run: |
    # Decide whether this matrix job's framework was requested and publish
    # the answer as the step output `update` (true/false).

    # Scheduled runs carry no dispatch input, so fall back to "all".
    FRAMEWORKS_INPUT="${FRAMEWORKS_INPUT:-all}"

    # Turn the comma-separated list into one whitespace-trimmed entry per
    # line so entries can be compared as whole values.
    requested="$(printf '%s\n' "$FRAMEWORKS_INPUT" \
      | tr ',' '\n' \
      | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"

    # -x -F: an entry must equal "all" or the framework name exactly. A plain
    # `grep "$FRAMEWORK"` would treat the name as a regex and match
    # substrings (e.g. "react-native" would select "react").
    if printf '%s\n' "$requested" | grep -qxF -e all -e "$FRAMEWORK"; then
      echo "update=true" >> "$GITHUB_OUTPUT"
    else
      echo "update=false" >> "$GITHUB_OUTPUT"
      echo "⏭️ Skipping $FRAMEWORK (not in update list)"
    fi
|
||||
|
||||
- name: Check for existing skill
  id: check_existing
  if: steps.should_update.outputs.update == 'true'
  run: |
    # Publish step output `exists` (true/false) depending on whether the
    # directory output/<framework> is present in the workspace.
    SKILL_DIR="output/$FRAMEWORK"

    if [ ! -d "$SKILL_DIR" ]; then
      echo "exists=false" >> $GITHUB_OUTPUT
      echo "🆕 No existing skill found"
    else
      echo "exists=true" >> $GITHUB_OUTPUT
      echo "📦 Found existing skill at $SKILL_DIR"
    fi
|
||||
|
||||
- name: Incremental update (if exists)
  if: steps.should_update.outputs.update == 'true' && steps.check_existing.outputs.exists == 'true'
  run: |
    echo "⚡ Performing incremental update for $FRAMEWORK..."

    # Detect changes using incremental updater.
    # The heredoc delimiter is quoted, so the shell expands nothing inside
    # the Python source; the framework name is read from the environment.
    # NOTE(review): this step only reports changes and records the current
    # versions; no re-scrape is triggered here - confirm that is intended.
    python3 << 'EOF'
    import os
    import sys
    from pathlib import Path

    # Make the in-repo package importable from the checkout.
    sys.path.insert(0, 'src')

    from skill_seekers.cli.incremental_updater import IncrementalUpdater

    framework = os.environ['FRAMEWORK']
    skill_dir = Path(f'output/{framework}')

    updater = IncrementalUpdater(skill_dir)
    changes = updater.detect_changes()

    if changes.has_changes:
        print("🔄 Changes detected:")
        print(f" Added: {len(changes.added)}")
        print(f" Modified: {len(changes.modified)}")
        print(f" Deleted: {len(changes.deleted)}")

        # Save current versions for next run
        updater.current_versions = updater._scan_documents()
        updater.save_current_versions()
    else:
        print("✓ No changes detected, skill is up to date")
    EOF
|
||||
|
||||
- name: Full scrape (if new or manual)
  # NOTE(review): the condition only covers the "no existing skill" case; a
  # manual run with an existing skill does not reach this step - confirm.
  if: steps.should_update.outputs.update == 'true' && steps.check_existing.outputs.exists == 'false'
  run: |
    echo "📥 Performing full scrape for $FRAMEWORK..."

    CONFIG_FILE="configs/${FRAMEWORK}.json"

    if [ -f "$CONFIG_FILE" ]; then
      # Use streaming ingestion for large docs
      skill-seekers scrape --config "$CONFIG_FILE" --streaming --max-pages 200
    else
      # A framework without a config file is skipped; the step still succeeds.
      echo "⚠️ Config not found: $CONFIG_FILE"
    fi
|
||||
|
||||
- name: Generate quality report
  if: steps.should_update.outputs.update == 'true'
  run: |
    SKILL_DIR="output/$FRAMEWORK"

    # Without a skill directory there is nothing to analyze; end the step
    # successfully instead of failing the job.
    [ -d "$SKILL_DIR" ] || { echo "⚠️ Skill directory not found"; exit 0; }

    echo "📊 Generating quality metrics..."

    # Quoted heredoc delimiter: the Python source reaches the interpreter
    # verbatim, and the framework name comes from the environment.
    python3 << 'EOF'
    import os
    import sys
    from pathlib import Path

    sys.path.insert(0, 'src')

    from skill_seekers.cli.quality_metrics import QualityAnalyzer

    framework = os.environ['FRAMEWORK']
    skill_dir = Path(f'output/{framework}')

    report = QualityAnalyzer(skill_dir).generate_report()
    score = report.overall_score

    # Print the grade plus the individual component percentages to the log.
    print(f"\n📊 Quality Score: {score.grade} ({score.total_score:.1f}/100)")
    print(f" Completeness: {score.completeness:.1f}%")
    print(f" Accuracy: {score.accuracy:.1f}%")
    print(f" Coverage: {score.coverage:.1f}%")
    print(f" Health: {score.health:.1f}%")
    EOF
|
||||
|
||||
- name: Package for Claude
  if: steps.should_update.outputs.update == 'true'
  run: |
    SKILL_DIR="output/$FRAMEWORK"

    # No skill directory means nothing to package; finish successfully.
    [ -d "$SKILL_DIR" ] || exit 0

    echo "📦 Packaging $FRAMEWORK for Claude AI..."
    skill-seekers package "$SKILL_DIR" --target claude
|
||||
|
||||
- name: Upload updated skill
  if: steps.should_update.outputs.update == 'true'
  # upload-artifact v3 has been retired on github.com and fails the job; v4
  # is the supported major. v4 requires artifact names to be unique within a
  # run, which the per-framework name below already guarantees.
  uses: actions/upload-artifact@v4
  with:
    name: ${{ env.FRAMEWORK }}-skill-updated
    path: output/${{ env.FRAMEWORK }}.zip
    retention-days: 90
|
||||
|
||||
summary:
  name: Update Summary
  needs: update-skills
  runs-on: ubuntu-latest
  # Run even when matrix jobs failed or were cancelled, so every run gets a
  # summary.
  if: always()

  steps:
    - name: Create summary
      env:
        # Aggregated outcome of the matrix job (success, failure, cancelled
        # or skipped), passed as data rather than interpolated into the script.
        UPDATE_RESULT: ${{ needs.update-skills.result }}
      run: |
        # Because this job runs under always(), the list below is the set of
        # frameworks in the update matrix, not a claim that each one was
        # updated; the result line reports what actually happened.
        {
          echo "## 🔄 Scheduled Skills Update"
          echo ""
          echo "**Date:** $(date -u '+%Y-%m-%d %H:%M UTC')"
          echo ""
          echo "**Result:** $UPDATE_RESULT"
          echo ""
          echo "### Frameworks in Update Matrix"
          echo "- React"
          echo "- Django"
          echo "- FastAPI"
          echo "- Godot"
          echo "- Vue"
          echo "- Flask"
          echo ""
          echo "Updated skills available in workflow artifacts."
        } >> "$GITHUB_STEP_SUMMARY"
|
||||
Reference in New Issue
Block a user