diff --git a/.github/workflows/translate-docs.yml b/.github/workflows/translate-docs.yml deleted file mode 100644 index 42a6ea5..0000000 --- a/.github/workflows/translate-docs.yml +++ /dev/null @@ -1,143 +0,0 @@ -name: Translate Documentation to Chinese - -on: - push: - branches: - - main - - development - paths: - - 'docs/**/*.md' - - '!docs/zh-CN/**' - - '!docs/archive/**' - workflow_dispatch: - inputs: - files: - description: 'Specific files to translate (comma-separated, or "all")' - required: false - default: 'changed' - -jobs: - detect-changes: - runs-on: ubuntu-latest - outputs: - changed-files: ${{ steps.detect.outputs.files }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 2 - - - name: Detect changed files - id: detect - run: | - if [ "${{ github.event.inputs.files }}" = "all" ]; then - # Translate all docs - FILES=$(find docs -name "*.md" -not -path "docs/zh-CN/*" -not -path "docs/archive/*" | tr '\n' ',') - elif [ "${{ github.event.inputs.files }}" != "" ] && [ "${{ github.event.inputs.files }}" != "changed" ]; then - # Use provided files - FILES="${{ github.event.inputs.files }}" - else - # Detect changed files - FILES=$(git diff --name-only HEAD~1 HEAD | grep "^docs/" | grep -v "^docs/zh-CN/" | grep -v "^docs/archive/" | grep "\.md$" | tr '\n' ',') - fi - - # Remove trailing comma - FILES=$(echo "$FILES" | sed 's/,$//') - - echo "files=$FILES" >> $GITHUB_OUTPUT - echo "Detected files: $FILES" - - translate: - runs-on: ubuntu-latest - needs: detect-changes - if: needs.detect-changes.outputs.changed-files != '' - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Install dependencies - run: | - pip install anthropic - - - name: Translate documents - env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - run: | - IFS=',' read -ra FILES <<< "${{ needs.detect-changes.outputs.changed-files }}" 
- for file in "${FILES[@]}"; do - if [ -f "$file" ]; then - echo "Translating: $file" - python scripts/translate_doc.py "$file" --target-lang zh-CN || echo "Failed: $file" - fi - done - - - name: Check for changes - id: git-check - run: | - git add docs/zh-CN/ - if git diff --cached --quiet; then - echo "changed=false" >> $GITHUB_OUTPUT - else - echo "changed=true" >> $GITHUB_OUTPUT - fi - - - name: Create Pull Request - if: steps.git-check.outputs.changed == 'true' - uses: peter-evans/create-pull-request@v6 - with: - token: ${{ secrets.GITHUB_TOKEN }} - commit-message: "[Auto] Chinese translation update" - title: "🌐 [Auto] Chinese Documentation Translation Update" - body: | - ## 🇨🇳 中文文档翻译更新 / Chinese Documentation Translation Update - - This PR contains automated translations of updated documentation. - - ### 变更内容 / Changes - ${{ needs.detect-changes.outputs.changed-files }} - - ### 审阅指南 / Review Guide - - [ ] 技术术语准确 / Technical terms accurate - - [ ] 链接正确指向中文版本 / Links point to Chinese versions - - [ ] 代码示例保持原样 / Code examples preserved - - [ ] 格式正确 / Formatting correct - - ### 如何审阅 / How to Review - 1. 查看文件列表 / Check the file list - 2. 阅读中文翻译 / Read the Chinese translation - 3. 在 PR 中提出修改建议 / Suggest changes in PR - 4. 确认后批准 / Approve when satisfied - - ### 相关 Issue - - #260 - Chinese Translation - - --- - - *This PR was auto-generated by GitHub Actions* - branch: auto-translate-zh-cn-${{ github.run_number }} - delete-branch: true - labels: translation, zh-CN, needs-review, automated - - - name: Update Issue #260 - if: steps.git-check.outputs.changed == 'true' - uses: actions/github-script@v7 - with: - script: | - github.rest.issues.createComment({ - issue_number: 260, - owner: context.repo.owner, - repo: context.repo.repo, - body: `🤖 **自动翻译更新 / Automated Translation Update** - - 新的中文翻译已准备就绪,需要社区审阅: - - PR: #${{ steps.create-pr.outputs.pull-request-number }} - - 文件: ${{ needs.detect-changes.outputs.changed-files }} - - 请志愿者帮忙审阅,谢谢! 
- / Community review needed, thanks!` - }) diff --git a/=0.24.0 b/=0.24.0 new file mode 100644 index 0000000..83a1f95 --- /dev/null +++ b/=0.24.0 @@ -0,0 +1,18 @@ +error: externally-managed-environment + +× This environment is externally managed +╰─> To install Python packages system-wide, try 'pacman -S + python-xyz', where xyz is the package you are trying to + install. + + If you wish to install a non-Arch-packaged Python package, + create a virtual environment using 'python -m venv path/to/venv'. + Then use path/to/venv/bin/python and path/to/venv/bin/pip. + + If you wish to install a non-Arch packaged Python application, + it may be easiest to use 'pipx install xyz', which will manage a + virtual environment for you. Make sure you have python-pipx + installed via pacman. + +note: If you believe this is a mistake, please contact your Python installation or OS distribution provider. You can override this, at the risk of breaking your Python installation or OS, by passing --break-system-packages. +hint: See PEP 668 for the detailed specification. diff --git a/CHANGELOG.md b/CHANGELOG.md index 11c4f6c..446f057 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,18 @@ All notable changes to Skill Seeker will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [Unreleased] + +### Changed +- **Explicit chunk flag names** — All `--chunk-*` flags now include unit suffixes to eliminate ambiguity: + - `--chunk-size` (RAG tokens) → `--chunk-tokens` + - `--chunk-overlap` (RAG tokens) → `--chunk-overlap-tokens` + - `--chunk` (enable RAG chunking) → `--chunk-for-rag` + - `--streaming-chunk-size` (chars) → `--streaming-chunk-chars` + - `--streaming-overlap` (chars) → `--streaming-overlap-chars` + - `--chunk-size` in PDF extractor (pages) → `--pdf-pages-per-chunk` +- **`setup_logging()` centralized** — Removed duplicate `logging.basicConfig()` calls in `github_scraper.py`, `codebase_scraper.py`, `unified_scraper.py`; all now use shared `setup_logging()` from `utils.py` + ## [3.1.2] - 2026-02-24 ### 🔧 Fix `create` Command Argument Forwarding, Gemini Model, and Enhance Dispatcher diff --git a/TESTING_GAP_REPORT.md b/TESTING_GAP_REPORT.md new file mode 100644 index 0000000..2277fb0 --- /dev/null +++ b/TESTING_GAP_REPORT.md @@ -0,0 +1,345 @@ +# Comprehensive Testing Gap Report + +**Project:** Skill Seekers v3.1.0 +**Date:** 2026-02-22 +**Total Test Files:** 113 +**Total Test Functions:** ~208+ (collected: 2173 tests) + +--- + +## Executive Summary + +### Overall Test Health: 🟡 GOOD with Gaps + +| Category | Status | Coverage | Key Gaps | +|----------|--------|----------|----------| +| CLI Arguments | ✅ Good | 85% | Some edge cases | +| Workflow System | ✅ Excellent | 90% | Inline stage parsing edge cases | +| Scrapers | 🟡 Moderate | 70% | Missing real HTTP/PDF tests | +| Enhancement | 🟡 Partial | 60% | Core logic not tested | +| MCP Tools | 🟡 Good | 71% | 9 tools not covered | +| Integration/E2E | 🟡 Moderate | 65% | Heavy mocking | +| Adaptors | ✅ Good | 80% | Good coverage per platform | + +--- + +## Detailed Findings by Category + +### 1. 
CLI Argument Tests ✅ GOOD + +**Files Reviewed:** +- `test_analyze_command.py` (269 lines, 26 tests) +- `test_unified.py` - TestUnifiedCLIArguments class (6 tests) +- `test_pdf_scraper.py` - TestPDFCLIArguments class (4 tests) +- `test_create_arguments.py` (399 lines) +- `test_create_integration_basic.py` (310 lines, 23 tests) + +**Strengths:** +- All new workflow flags are tested (`--enhance-workflow`, `--enhance-stage`, `--var`, `--workflow-dry-run`) +- Argument parsing thoroughly tested +- Default values verified +- Complex command combinations tested + +**Gaps:** +- `test_create_integration_basic.py`: 2 tests skipped (source auto-detection not fully tested) +- No tests for invalid argument combinations beyond basic parsing errors + +--- + +### 2. Workflow Tests ✅ EXCELLENT + +**Files Reviewed:** +- `test_workflow_runner.py` (445 lines, 30+ tests) +- `test_workflows_command.py` (571 lines, 40+ tests) +- `test_workflow_tools_mcp.py` (295 lines, 20+ tests) + +**Strengths:** +- Comprehensive workflow execution tests +- Variable substitution thoroughly tested +- Dry-run mode tested +- Workflow chaining tested +- All 6 workflow subcommands tested (list, show, copy, add, remove, validate) +- MCP workflow tools tested + +**Minor Gaps:** +- No tests for `_build_inline_engine` edge cases +- No tests for malformed stage specs (empty, invalid format) + +--- + +### 3. Scraper Tests 🟡 MODERATE with Significant Gaps + +**Files Reviewed:** +- `test_scraper_features.py` (524 lines) - Doc scraper features +- `test_codebase_scraper.py` (478 lines) - Codebase analysis +- `test_pdf_scraper.py` (558 lines) - PDF scraper +- `test_github_scraper.py` (1015 lines) - GitHub scraper +- `test_unified_analyzer.py` (428 lines) - Unified analyzer + +**Critical Gaps:** + +#### A. 
Missing Real External Resource Tests +| Resource | Test Type | Status | +|----------|-----------|--------| +| HTTP Requests (docs) | Mocked only | ❌ Gap | +| PDF Extraction | Mocked only | ❌ Gap | +| GitHub API | Mocked only | ❌ Gap (acceptable) | +| Local Files | Real tests | ✅ Good | + +#### B. Missing Core Function Tests +| Function | Location | Priority | +|----------|----------|----------| +| `UnifiedScraper.run()` | unified_scraper.py | 🔴 High | +| `UnifiedScraper._scrape_documentation()` | unified_scraper.py | 🔴 High | +| `UnifiedScraper._scrape_github()` | unified_scraper.py | 🔴 High | +| `UnifiedScraper._scrape_pdf()` | unified_scraper.py | 🔴 High | +| `UnifiedScraper._scrape_local()` | unified_scraper.py | 🟡 Medium | +| `DocToSkillConverter.scrape()` | doc_scraper.py | 🔴 High | +| `PDFToSkillConverter.extract_pdf()` | pdf_scraper.py | 🔴 High | + +#### C. PDF Scraper Limited Coverage +- No actual PDF parsing tests (only mocked) +- OCR functionality not tested +- Page range extraction not tested + +--- + +### 4. Enhancement Tests 🟡 PARTIAL - MAJOR GAPS + +**Files Reviewed:** +- `test_enhance_command.py` (367 lines, 25+ tests) +- `test_enhance_skill_local.py` (163 lines, 14 tests) + +**Critical Gap in `test_enhance_skill_local.py`:** + +| Function | Lines | Tested? 
| Priority | +|----------|-------|---------|----------| +| `summarize_reference()` | ~50 | ❌ No | 🔴 High | +| `create_enhancement_prompt()` | ~200 | ❌ No | 🔴 High | +| `run()` | ~100 | ❌ No | 🔴 High | +| `_run_headless()` | ~130 | ❌ No | 🔴 High | +| `_run_background()` | ~80 | ❌ No | 🟡 Medium | +| `_run_daemon()` | ~60 | ❌ No | 🟡 Medium | +| `write_status()` | ~30 | ❌ No | 🟡 Medium | +| `read_status()` | ~40 | ❌ No | 🟡 Medium | +| `detect_terminal_app()` | ~80 | ❌ No | 🟡 Medium | + +**Current Tests Only Cover:** +- Agent presets configuration +- Command building +- Agent name normalization +- Environment variable handling + +**Recommendation:** Add comprehensive tests for the core enhancement logic. + +--- + +### 5. MCP Tool Tests 🟡 GOOD with Coverage Gaps + +**Files Reviewed:** +- `test_mcp_fastmcp.py` (868 lines) +- `test_mcp_server.py` (715 lines) +- `test_mcp_vector_dbs.py` (259 lines) +- `test_real_world_fastmcp.py` (558 lines) + +**Coverage Analysis:** + +| Tool Category | Tools | Tested | Coverage | +|---------------|-------|--------|----------| +| Config Tools | 3 | 3 | ✅ 100% | +| Scraping Tools | 8 | 4 | 🟡 50% | +| Packaging Tools | 4 | 4 | ✅ 100% | +| Splitting Tools | 2 | 2 | ✅ 100% | +| Source Tools | 5 | 5 | ✅ 100% | +| Vector DB Tools | 4 | 4 | ✅ 100% | +| Workflow Tools | 5 | 0 | ❌ 0% | +| **Total** | **31** | **22** | **🟡 71%** | + +**Untested Tools:** +1. `detect_patterns` +2. `extract_test_examples` +3. `build_how_to_guides` +4. `extract_config_patterns` +5. `list_workflows` +6. `get_workflow` +7. `create_workflow` +8. `update_workflow` +9. `delete_workflow` + +**Note:** `test_mcp_server.py` tests legacy server, `test_mcp_fastmcp.py` tests modern server. + +--- + +### 6. 
Integration/E2E Tests 🟡 MODERATE + +**Files Reviewed:** +- `test_create_integration_basic.py` (310 lines) +- `test_e2e_three_stream_pipeline.py` (598 lines) +- `test_analyze_e2e.py` (344 lines) +- `test_install_skill_e2e.py` (533 lines) +- `test_c3_integration.py` (362 lines) + +**Issues Found:** + +1. **Skipped Tests:** + - `test_create_detects_web_url` - Source auto-detection incomplete + - `test_create_invalid_source_shows_error` - Error handling incomplete + - `test_cli_via_unified_command` - Asyncio issues + +2. **Heavy Mocking:** + - Most GitHub API tests use mocking + - No real HTTP tests for doc scraping + - Integration tests don't test actual integration + +3. **Limited Scope:** + - Only `--quick` preset tested (not `--comprehensive`) + - C3.x tests use mock data only + - Most E2E tests are unit tests with mocks + +--- + +### 7. Adaptor Tests ✅ GOOD + +**Files Reviewed:** +- `test_adaptors/test_adaptors_e2e.py` (893 lines) +- `test_adaptors/test_claude_adaptor.py` (314 lines) +- `test_adaptors/test_gemini_adaptor.py` (146 lines) +- `test_adaptors/test_openai_adaptor.py` (188 lines) +- Plus 8 more platform adaptors + +**Strengths:** +- Each adaptor has dedicated tests +- Package format testing +- Upload success/failure scenarios +- Platform-specific features tested + +**Minor Gaps:** +- Some adaptors only test 1-2 scenarios +- Error handling coverage varies by platform + +--- + +### 8. Config/Validation Tests ✅ GOOD + +**Files Reviewed:** +- `test_config_validation.py` (270 lines) +- `test_config_extractor.py` (629 lines) +- `test_config_fetcher.py` (340 lines) + +**Strengths:** +- Unified vs legacy format detection +- Field validation comprehensive +- Error message quality tested + +--- + +## Summary of Critical Testing Gaps + +### 🔴 HIGH PRIORITY (Must Fix) + +1. **Enhancement Core Logic** + - File: `test_enhance_skill_local.py` + - Missing: 9 major functions + - Impact: Core feature untested + +2. 
**Unified Scraper Main Flow** + - File: New tests needed + - Missing: `_scrape_*()` methods, `run()` orchestration + - Impact: Multi-source scraping untested + +3. **Actual HTTP/PDF/GitHub Integration** + - Missing: Real external resource tests + - Impact: Only mock tests exist + +### 🟡 MEDIUM PRIORITY (Should Fix) + +4. **MCP Workflow Tools** + - Missing: 5 workflow tools (0% coverage) + - Impact: MCP workflow features untested + +5. **Skipped Integration Tests** + - 3 tests skipped + - Impact: Source auto-detection incomplete + +6. **PDF Real Extraction** + - Missing: Actual PDF parsing + - Impact: PDF feature quality unknown + +### 🟢 LOW PRIORITY (Nice to Have) + +7. **Additional Scraping Tools** + - Missing: 4 scraping tool tests + - Impact: Low (core tools covered) + +8. **Edge Case Coverage** + - Missing: Invalid argument combinations + - Impact: Low (happy path covered) + +--- + +## Recommendations + +### Immediate Actions (Next Sprint) + +1. **Add Enhancement Logic Tests** (~400 lines) + - Test `summarize_reference()` + - Test `create_enhancement_prompt()` + - Test `run()` method + - Test status read/write + +2. **Fix Skipped Tests** (~100 lines) + - Fix asyncio issues in `test_cli_via_unified_command` + - Complete source auto-detection tests + +3. **Add MCP Workflow Tool Tests** (~200 lines) + - Test all 5 workflow tools + +### Short Term (Next Month) + +4. **Add Unified Scraper Integration Tests** (~300 lines) + - Test main orchestration flow + - Test individual source scraping + +5. **Add Real PDF Tests** (~150 lines) + - Test with actual PDF files + - Test OCR if available + +### Long Term (Next Quarter) + +6. **HTTP Integration Tests** (~200 lines) + - Test with real websites (use test sites) + - Mock server approach + +7. 
**Complete E2E Pipeline** (~300 lines) + - Full workflow from scrape to upload + - Real GitHub repo (fork test repo) + +--- + +## Test Quality Metrics + +| Metric | Score | Notes | +|--------|-------|-------| +| Test Count | 🟢 Good | 2173+ tests | +| Coverage | 🟡 Moderate | ~75% estimated | +| Real Tests | 🟡 Moderate | Many mocked | +| Documentation | 🟢 Good | Most tests documented | +| Maintenance | 🟢 Good | Tests recently updated | + +--- + +## Conclusion + +The Skill Seekers test suite is **comprehensive in quantity** (2173+ tests) but has **quality gaps** in critical areas: + +1. **Core enhancement logic** is largely untested +2. **Multi-source scraping** orchestration lacks integration tests +3. **MCP workflow tools** have zero coverage +4. **Real external resource** testing is minimal + +**Priority:** Fix the 🔴 HIGH priority gaps first, as they impact core functionality. + +--- + +*Report generated: 2026-02-22* +*Reviewer: Systematic test review with parallel subagent analysis* diff --git a/docs/archive/legacy/QUICK_REFERENCE.md b/docs/archive/legacy/QUICK_REFERENCE.md index 7004be7..67ca817 100644 --- a/docs/archive/legacy/QUICK_REFERENCE.md +++ b/docs/archive/legacy/QUICK_REFERENCE.md @@ -71,7 +71,7 @@ skill-seekers pdf manual.pdf --name product-manual skill-seekers pdf scanned.pdf --enable-ocr # Large PDF (chunked processing) -skill-seekers pdf large.pdf --chunk-size 50 +skill-seekers pdf large.pdf --pdf-pages-per-chunk 50 ``` ### Multi-Source Scraping diff --git a/docs/archive/research/PDF_IMAGE_EXTRACTION.md b/docs/archive/research/PDF_IMAGE_EXTRACTION.md index 9d17186..26b58e8 100644 --- a/docs/archive/research/PDF_IMAGE_EXTRACTION.md +++ b/docs/archive/research/PDF_IMAGE_EXTRACTION.md @@ -122,7 +122,7 @@ python3 cli/pdf_extractor_poc.py documentation.pdf \ --extract-images \ --min-image-size 150 \ --min-quality 6.0 \ - --chunk-size 20 \ + --pdf-pages-per-chunk 20 \ --output documentation.json \ --verbose \ --pretty @@ -477,7 +477,7 @@ python3 
cli/pdf_extractor_poc.py manual.pdf \ --image-dir assets/images/ \ --min-image-size 200 \ --min-quality 7.0 \ - --chunk-size 15 \ + --pdf-pages-per-chunk 15 \ --output manual.json \ --verbose \ --pretty diff --git a/docs/features/PDF_CHUNKING.md b/docs/features/PDF_CHUNKING.md index 1ff8a48..93ca392 100644 --- a/docs/features/PDF_CHUNKING.md +++ b/docs/features/PDF_CHUNKING.md @@ -25,10 +25,10 @@ Break large PDFs into smaller, manageable chunks: python3 cli/pdf_extractor_poc.py input.pdf # Custom chunk size (20 pages per chunk) -python3 cli/pdf_extractor_poc.py input.pdf --chunk-size 20 +python3 cli/pdf_extractor_poc.py input.pdf --pdf-pages-per-chunk 20 # Disable chunking (single chunk with all pages) -python3 cli/pdf_extractor_poc.py input.pdf --chunk-size 0 +python3 cli/pdf_extractor_poc.py input.pdf --pdf-pages-per-chunk 0 ``` ### ✅ 2. Chapter/Section Detection @@ -272,7 +272,7 @@ cat manual.json | jq '.total_chunks' ```bash # Large PDF with bigger chunks (50 pages each) -python3 cli/pdf_extractor_poc.py large_manual.pdf --chunk-size 50 -o output.json -v +python3 cli/pdf_extractor_poc.py large_manual.pdf --pdf-pages-per-chunk 50 -o output.json -v # Verbose output shows: # 📦 Creating chunks (chunk_size=50)... @@ -286,7 +286,7 @@ python3 cli/pdf_extractor_poc.py large_manual.pdf --chunk-size 50 -o output.json ```bash # Process all pages as single chunk -python3 cli/pdf_extractor_poc.py small_doc.pdf --chunk-size 0 -o output.json +python3 cli/pdf_extractor_poc.py small_doc.pdf --pdf-pages-per-chunk 0 -o output.json ``` --- @@ -369,7 +369,7 @@ Create a test PDF with chapters: 3. 
Page 30: "Chapter 3: API Reference" ```bash -python3 cli/pdf_extractor_poc.py test.pdf -o test.json --chunk-size 20 -v +python3 cli/pdf_extractor_poc.py test.pdf -o test.json --pdf-pages-per-chunk 20 -v # Verify chapters detected cat test.json | jq '.chapters' @@ -441,7 +441,7 @@ The chunking feature lays groundwork for: **Example workflow:** ```bash # Extract large manual with chapters -python3 cli/pdf_extractor_poc.py large_manual.pdf --chunk-size 25 -o manual.json +python3 cli/pdf_extractor_poc.py large_manual.pdf --pdf-pages-per-chunk 25 -o manual.json # Future: Build skill from chunks python3 cli/build_skill_from_pdf.py manual.json diff --git a/docs/integrations/CHROMA.md b/docs/integrations/CHROMA.md index 7dc6238..b016bff 100644 --- a/docs/integrations/CHROMA.md +++ b/docs/integrations/CHROMA.md @@ -223,7 +223,7 @@ skill-seekers package output/codebase --target langchain **Option D: RAG-Optimized Chunking** ```bash -skill-seekers scrape --config configs/fastapi.json --chunk-for-rag --chunk-size 512 +skill-seekers scrape --config configs/fastapi.json --chunk-for-rag --chunk-tokens 512 skill-seekers package output/fastapi --target langchain ``` @@ -968,7 +968,7 @@ collection.add( 2. **Implement Semantic Chunking:** ```bash - skill-seekers scrape --config configs/fastapi.json --chunk-for-rag --chunk-size 512 + skill-seekers scrape --config configs/fastapi.json --chunk-for-rag --chunk-tokens 512 ``` 3. 
**Set Up Multi-Collection Search:** diff --git a/docs/integrations/FAISS.md b/docs/integrations/FAISS.md index e9324b7..658f0a4 100644 --- a/docs/integrations/FAISS.md +++ b/docs/integrations/FAISS.md @@ -255,7 +255,7 @@ skill-seekers package output/codebase --target langchain **Option D: RAG-Optimized Chunking** ```bash -skill-seekers scrape --config configs/fastapi.json --chunk-for-rag --chunk-size 512 +skill-seekers scrape --config configs/fastapi.json --chunk-for-rag --chunk-tokens 512 skill-seekers package output/fastapi --target langchain ``` diff --git a/docs/integrations/HAYSTACK.md b/docs/integrations/HAYSTACK.md index 47a087c..9f8ddcb 100644 --- a/docs/integrations/HAYSTACK.md +++ b/docs/integrations/HAYSTACK.md @@ -318,8 +318,8 @@ print(response["llm"]["replies"][0]) # Enable semantic chunking (preserves code blocks, respects paragraphs) skill-seekers scrape --config configs/django.json \ --chunk-for-rag \ - --chunk-size 512 \ - --chunk-overlap 50 + --chunk-tokens 512 \ + --chunk-overlap-tokens 50 # Package chunked output skill-seekers package output/django --target haystack @@ -439,8 +439,8 @@ python scripts/merge_documents.py \ # Enable chunking for frameworks with long pages skill-seekers scrape --config configs/django.json \ --chunk-for-rag \ - --chunk-size 512 \ - --chunk-overlap 50 + --chunk-tokens 512 \ + --chunk-overlap-tokens 50 ``` ### 2. Choose Right Document Store @@ -506,8 +506,8 @@ Complete example of building a FastAPI documentation chatbot: # Scrape FastAPI docs with chunking skill-seekers scrape --config configs/fastapi.json \ --chunk-for-rag \ - --chunk-size 512 \ - --chunk-overlap 50 \ + --chunk-tokens 512 \ + --chunk-overlap-tokens 50 \ --max-pages 200 # Package for Haystack @@ -698,8 +698,8 @@ skill-seekers scrape --config configs/fastapi.json --chunk-for-rag # 2. 
Adjust chunk size skill-seekers scrape --config configs/fastapi.json \ --chunk-for-rag \ - --chunk-size 768 \ # Larger chunks for more context - --chunk-overlap 100 # More overlap for continuity + --chunk-tokens 768 \ + --chunk-overlap-tokens 100 # Larger chunks (768) for more context; more overlap (100) for continuity # 3. Use hybrid search (BM25 + embeddings) # See Advanced Usage section diff --git a/docs/integrations/QDRANT.md b/docs/integrations/QDRANT.md index 9dec5e3..beaf8d7 100644 --- a/docs/integrations/QDRANT.md +++ b/docs/integrations/QDRANT.md @@ -270,7 +270,7 @@ skill-seekers package output/codebase --target langchain **Option D: RAG-Optimized Chunking** ```bash -skill-seekers scrape --config configs/fastapi.json --chunk-for-rag --chunk-size 512 +skill-seekers scrape --config configs/fastapi.json --chunk-for-rag --chunk-tokens 512 skill-seekers package output/fastapi --target langchain ``` diff --git a/docs/integrations/WEAVIATE.md b/docs/integrations/WEAVIATE.md index dd17a0e..1648ba5 100644 --- a/docs/integrations/WEAVIATE.md +++ b/docs/integrations/WEAVIATE.md @@ -210,7 +210,7 @@ skill-seekers package output/codebase --target langchain **Option D: RAG-Optimized Chunking** ```bash -skill-seekers scrape --config configs/fastapi.json --chunk-for-rag --chunk-size 512 +skill-seekers scrape --config configs/fastapi.json --chunk-for-rag --chunk-tokens 512 skill-seekers package output/fastapi --target langchain ``` @@ -960,7 +960,7 @@ print(schema.get("multiTenancyConfig", {}).get("enabled")) # Should be True 2. **Implement Semantic Chunking:** ```bash - skill-seekers scrape --config configs/fastapi.json --chunk-for-rag --chunk-size 512 + skill-seekers scrape --config configs/fastapi.json --chunk-for-rag --chunk-tokens 512 ``` 3. 
**Set Up Multi-Tenancy:** diff --git a/docs/reference/CLI_REFERENCE.md b/docs/reference/CLI_REFERENCE.md index 7fac99b..f5be01f 100644 --- a/docs/reference/CLI_REFERENCE.md +++ b/docs/reference/CLI_REFERENCE.md @@ -252,8 +252,8 @@ skill-seekers create [source] [options] | | `--workflow-dry-run` | | Preview workflow without executing | | | `--dry-run` | | Preview without creating | | | `--chunk-for-rag` | | Enable RAG chunking | -| | `--chunk-size` | 512 | Chunk size in tokens | -| | `--chunk-overlap` | 50 | Chunk overlap in tokens | +| | `--chunk-tokens` | 512 | Chunk size in tokens | +| | `--chunk-overlap-tokens` | 50 | Chunk overlap in tokens | | | `--help-web` | | Show web scraping options | | | `--help-github` | | Show GitHub options | | | `--help-local` | | Show local analysis options | @@ -615,10 +615,10 @@ skill-seekers package SKILL_DIRECTORY [options] | | `--skip-quality-check` | | Skip quality checks | | | `--upload` | | Auto-upload after packaging | | | `--streaming` | | Streaming mode for large docs | -| | `--chunk-size` | 4000 | Max chars per chunk (streaming) | -| | `--chunk-overlap` | 200 | Overlap between chunks | +| | `--streaming-chunk-chars` | 4000 | Max chars per chunk (streaming) | +| | `--streaming-overlap-chars` | 200 | Overlap between chunks (chars) | | | `--batch-size` | 100 | Chunks per batch | -| | `--chunk` | | Enable RAG chunking | +| | `--chunk-for-rag` | | Enable RAG chunking | | | `--chunk-tokens` | 512 | Max tokens per chunk | | | `--no-preserve-code` | | Allow code block splitting | @@ -877,7 +877,7 @@ skill-seekers stream --config CONFIG [options] | Short | Long | Description | |-------|------|-------------| | `-c` | `--config` | Config JSON file | -| | `--chunk-size` | Size of each chunk | +| | `--streaming-chunk-chars` | Maximum characters per chunk (default: 4000) | | | `--output` | Output directory | **Examples:** @@ -887,7 +887,7 @@ skill-seekers stream --config CONFIG [options] skill-seekers stream --config 
configs/large-docs.json # Custom chunk size -skill-seekers stream --config configs/large-docs.json --chunk-size 1000 +skill-seekers stream --config configs/large-docs.json --streaming-chunk-chars 1000 ``` --- diff --git a/docs/strategy/ACTION_PLAN.md b/docs/strategy/ACTION_PLAN.md index 1be4d2f..e49a903 100644 --- a/docs/strategy/ACTION_PLAN.md +++ b/docs/strategy/ACTION_PLAN.md @@ -365,8 +365,8 @@ Position Skill Seekers as **the universal documentation preprocessor** for the e 2. **Implement Chunking for RAG** (8-12 hours) ```bash skill-seekers scrape --chunk-for-rag \ - --chunk-size 512 \ - --chunk-overlap 50 \ + --chunk-tokens 512 \ + --chunk-overlap-tokens 50 \ --preserve-code-blocks ``` diff --git a/docs/strategy/KIMI_ANALYSIS_COMPARISON.md b/docs/strategy/KIMI_ANALYSIS_COMPARISON.md index 5d43714..2cefce8 100644 --- a/docs/strategy/KIMI_ANALYSIS_COMPARISON.md +++ b/docs/strategy/KIMI_ANALYSIS_COMPARISON.md @@ -139,8 +139,8 @@ skill-seekers scrape --format confluence # Confluence storage format ```bash # New flag for embedding-optimized chunking skill-seekers scrape --chunk-for-rag \ - --chunk-size 512 \ - --chunk-overlap 50 \ + --chunk-tokens 512 \ + --chunk-overlap-tokens 50 \ --add-metadata # Output: chunks with metadata for embedding diff --git a/docs/user-guide/02-scraping.md b/docs/user-guide/02-scraping.md index d54a016..63e448a 100644 --- a/docs/user-guide/02-scraping.md +++ b/docs/user-guide/02-scraping.md @@ -385,7 +385,7 @@ skill-seekers create --max-pages 100 skill-seekers create --streaming # Or smaller chunks -skill-seekers create --chunk-size 500 +skill-seekers create --chunk-tokens 500 ``` --- diff --git a/docs/user-guide/04-packaging.md b/docs/user-guide/04-packaging.md index 847453c..cced71a 100644 --- a/docs/user-guide/04-packaging.md +++ b/docs/user-guide/04-packaging.md @@ -158,8 +158,8 @@ skill-seekers package output/large-skill/ --streaming # Custom chunk size skill-seekers package output/large-skill/ \ --streaming \ - --chunk-size 2000 
\ - --chunk-overlap 100 + --streaming-chunk-chars 2000 \ + --streaming-overlap-chars 100 ``` **When to use:** @@ -177,23 +177,23 @@ Optimize for Retrieval-Augmented Generation: # Enable semantic chunking skill-seekers package output/my-skill/ \ --target langchain \ - --chunk \ + --chunk-for-rag \ --chunk-tokens 512 # Custom chunk size skill-seekers package output/my-skill/ \ --target chroma \ --chunk-tokens 256 \ - --chunk-overlap 50 + --chunk-overlap-tokens 50 ``` **Chunking Options:** | Option | Default | Description | |--------|---------|-------------| -| `--chunk` | auto | Enable chunking | +| `--chunk-for-rag` | auto | Enable chunking | | `--chunk-tokens` | 512 | Tokens per chunk | -| `--chunk-overlap` | 50 | Overlap between chunks | +| `--chunk-overlap-tokens` | 50 | Overlap between chunks (tokens) | | `--no-preserve-code` | - | Allow splitting code blocks | --- @@ -449,7 +449,7 @@ skill-seekers upload output/my-skill-claude.zip --target claude skill-seekers package output/my-skill/ --streaming # Smaller chunks -skill-seekers package output/my-skill/ --streaming --chunk-size 1000 +skill-seekers package output/my-skill/ --streaming --streaming-chunk-chars 1000 ``` --- diff --git a/docs/user-guide/06-troubleshooting.md b/docs/user-guide/06-troubleshooting.md index 00d01b0..88042f3 100644 --- a/docs/user-guide/06-troubleshooting.md +++ b/docs/user-guide/06-troubleshooting.md @@ -295,7 +295,7 @@ skill-seekers package output/my-skill/ --streaming # Reduce chunk size skill-seekers package output/my-skill/ \ --streaming \ - --chunk-size 1000 + --streaming-chunk-chars 1000 ``` --- diff --git a/docs/zh-CN/reference/CLI_REFERENCE.md b/docs/zh-CN/reference/CLI_REFERENCE.md index 5b64ba8..88ffbc0 100644 --- a/docs/zh-CN/reference/CLI_REFERENCE.md +++ b/docs/zh-CN/reference/CLI_REFERENCE.md @@ -237,8 +237,8 @@ skill-seekers create [source] [options] | | `--workflow-dry-run` | | Preview workflow without executing | | | `--dry-run` | | Preview without creating | | | 
`--chunk-for-rag` | | Enable RAG chunking | -| | `--chunk-size` | 512 | Chunk size in tokens | -| | `--chunk-overlap` | 50 | Chunk overlap in tokens | +| | `--chunk-tokens` | 512 | Chunk size in tokens | +| | `--chunk-overlap-tokens` | 50 | Chunk overlap in tokens | | | `--help-web` | | Show web scraping options | | | `--help-github` | | Show GitHub options | | | `--help-local` | | Show local analysis options | @@ -593,10 +593,10 @@ skill-seekers package SKILL_DIRECTORY [options] | | `--skip-quality-check` | | Skip quality checks | | | `--upload` | | Auto-upload after packaging | | | `--streaming` | | Streaming mode for large docs | -| | `--chunk-size` | 4000 | Max chars per chunk (streaming) | -| | `--chunk-overlap` | 200 | Overlap between chunks | +| | `--streaming-chunk-chars` | 4000 | Max chars per chunk (streaming) | +| | `--streaming-overlap-chars` | 200 | Overlap between chunks (chars) | | | `--batch-size` | 100 | Chunks per batch | -| | `--chunk` | | Enable RAG chunking | +| | `--chunk-for-rag` | | Enable RAG chunking | | | `--chunk-tokens` | 512 | Max tokens per chunk | | | `--no-preserve-code` | | Allow code block splitting | @@ -847,7 +847,7 @@ skill-seekers stream --config CONFIG [options] | Short | Long | Description | |-------|------|-------------| | `-c` | `--config` | Config JSON file | -| | `--chunk-size` | Size of each chunk | +| | `--streaming-chunk-chars` | Maximum characters per chunk (default: 4000) | | | `--output` | Output directory | **Examples:** @@ -857,7 +857,7 @@ skill-seekers stream --config CONFIG [options] skill-seekers stream --config configs/large-docs.json # Custom chunk size -skill-seekers stream --config configs/large-docs.json --chunk-size 1000 +skill-seekers stream --config configs/large-docs.json --streaming-chunk-chars 1000 ``` --- diff --git a/docs/zh-CN/user-guide/02-scraping.md b/docs/zh-CN/user-guide/02-scraping.md index d54a016..63e448a 100644 --- a/docs/zh-CN/user-guide/02-scraping.md +++ 
b/docs/zh-CN/user-guide/02-scraping.md @@ -385,7 +385,7 @@ skill-seekers create --max-pages 100 skill-seekers create --streaming # Or smaller chunks -skill-seekers create --chunk-size 500 +skill-seekers create --chunk-tokens 500 ``` --- diff --git a/docs/zh-CN/user-guide/04-packaging.md b/docs/zh-CN/user-guide/04-packaging.md index 847453c..cced71a 100644 --- a/docs/zh-CN/user-guide/04-packaging.md +++ b/docs/zh-CN/user-guide/04-packaging.md @@ -158,8 +158,8 @@ skill-seekers package output/large-skill/ --streaming # Custom chunk size skill-seekers package output/large-skill/ \ --streaming \ - --chunk-size 2000 \ - --chunk-overlap 100 + --streaming-chunk-chars 2000 \ + --streaming-overlap-chars 100 ``` **When to use:** @@ -177,23 +177,23 @@ Optimize for Retrieval-Augmented Generation: # Enable semantic chunking skill-seekers package output/my-skill/ \ --target langchain \ - --chunk \ + --chunk-for-rag \ --chunk-tokens 512 # Custom chunk size skill-seekers package output/my-skill/ \ --target chroma \ --chunk-tokens 256 \ - --chunk-overlap 50 + --chunk-overlap-tokens 50 ``` **Chunking Options:** | Option | Default | Description | |--------|---------|-------------| -| `--chunk` | auto | Enable chunking | +| `--chunk-for-rag` | auto | Enable chunking | | `--chunk-tokens` | 512 | Tokens per chunk | -| `--chunk-overlap` | 50 | Overlap between chunks | +| `--chunk-overlap-tokens` | 50 | Overlap between chunks (tokens) | | `--no-preserve-code` | - | Allow splitting code blocks | --- @@ -449,7 +449,7 @@ skill-seekers upload output/my-skill-claude.zip --target claude skill-seekers package output/my-skill/ --streaming # Smaller chunks -skill-seekers package output/my-skill/ --streaming --chunk-size 1000 +skill-seekers package output/my-skill/ --streaming --streaming-chunk-chars 1000 ``` --- diff --git a/docs/zh-CN/user-guide/06-troubleshooting.md b/docs/zh-CN/user-guide/06-troubleshooting.md index 00d01b0..88042f3 100644 --- a/docs/zh-CN/user-guide/06-troubleshooting.md +++ 
b/docs/zh-CN/user-guide/06-troubleshooting.md @@ -295,7 +295,7 @@ skill-seekers package output/my-skill/ --streaming # Reduce chunk size skill-seekers package output/my-skill/ \ --streaming \ - --chunk-size 1000 + --streaming-chunk-chars 1000 ``` --- diff --git a/examples/haystack-pipeline/README.md b/examples/haystack-pipeline/README.md index d40ba08..070a7a1 100644 --- a/examples/haystack-pipeline/README.md +++ b/examples/haystack-pipeline/README.md @@ -132,7 +132,7 @@ For better retrieval quality, use semantic chunking: ```bash # Generate with chunking -skill-seekers scrape --config configs/react.json --max-pages 100 --chunk-for-rag --chunk-size 512 --chunk-overlap 50 +skill-seekers scrape --config configs/react.json --max-pages 100 --chunk-for-rag --chunk-tokens 512 --chunk-overlap-tokens 50 # Use chunked output python quickstart.py --chunked diff --git a/src/skill_seekers/workflows/comparison-matrix.yaml b/src/skill_seekers/workflows/comparison-matrix.yaml index 669452f..d87dabe 100644 --- a/src/skill_seekers/workflows/comparison-matrix.yaml +++ b/src/skill_seekers/workflows/comparison-matrix.yaml @@ -6,7 +6,6 @@ applies_to: - doc_scraping variables: depth: comprehensive - alternatives: [] stages: - name: feature_comparison type: custom diff --git a/src/skill_seekers/workflows/data-validation.yaml b/src/skill_seekers/workflows/data-validation.yaml index 198ebff..fa31efb 100644 --- a/src/skill_seekers/workflows/data-validation.yaml +++ b/src/skill_seekers/workflows/data-validation.yaml @@ -164,5 +164,5 @@ post_process: add_metadata: enhanced: true workflow: data-validation - domain: ml + domain: backend has_validation_docs: true diff --git a/src/skill_seekers/workflows/default.yaml b/src/skill_seekers/workflows/default.yaml index f5329d3..7992352 100644 --- a/src/skill_seekers/workflows/default.yaml +++ b/src/skill_seekers/workflows/default.yaml @@ -17,6 +17,46 @@ stages: target: examples enabled: true uses_history: false + - name: architecture_overview + type: 
custom + target: architecture + uses_history: false + enabled: true + prompt: > + Provide a concise architectural overview of this codebase. + + Cover: + 1. Overall architecture style (MVC, microservices, layered, etc.) + 2. Key components and their responsibilities + 3. Data flow between components + 4. External dependencies and integrations + 5. Entry points (CLI, API, web, etc.) + + Output JSON with: + - "architecture_style": main architectural pattern + - "components": array of {name, responsibility} + - "data_flow": how data moves through the system + - "external_deps": third-party services and libraries + - "entry_points": how users interact with the system + - name: skill_polish + type: custom + target: skill_md + uses_history: true + enabled: true + prompt: > + Review the SKILL.md content generated so far and improve it. + + Fix: + 1. Unclear or overly technical descriptions + 2. Missing quick-start examples + 3. Gaps in the overview section + 4. Redundant or duplicate information + 5. Formatting inconsistencies + + Output JSON with: + - "improved_overview": rewritten overview section + - "quick_start": concise getting-started snippet + - "key_concepts": 3-5 essential concepts a developer needs to know post_process: reorder_sections: [] add_metadata: diff --git a/src/skill_seekers/workflows/minimal.yaml b/src/skill_seekers/workflows/minimal.yaml index a8f4a23..b4f5919 100644 --- a/src/skill_seekers/workflows/minimal.yaml +++ b/src/skill_seekers/workflows/minimal.yaml @@ -14,12 +14,17 @@ stages: uses_history: false enabled: true prompt: > - Review the following SKILL.md content and make minimal improvements: - - Fix obvious formatting issues - - Ensure the overview section is clear and concise - - Remove duplicate or redundant information + Review the SKILL.md content and make minimal targeted improvements. - Return the improved content as plain text without extra commentary. + Fix only: + 1. 
Obvious formatting issues (broken lists, inconsistent headers) + 2. Unclear overview section (make it one clear paragraph) + 3. Duplicate or redundant information (remove repeats) + + Output JSON with: + - "improved_overview": rewritten overview paragraph (plain markdown) + - "removed_sections": list of section names that were removed as duplicates + - "formatting_fixes": list of specific formatting issues corrected post_process: reorder_sections: [] add_metadata: diff --git a/src/skill_seekers/workflows/security-focus.yaml b/src/skill_seekers/workflows/security-focus.yaml index c2f7923..c6d2e80 100644 --- a/src/skill_seekers/workflows/security-focus.yaml +++ b/src/skill_seekers/workflows/security-focus.yaml @@ -3,9 +3,7 @@ description: "Security-focused review: vulnerabilities, auth, data handling" version: "1.0" applies_to: - codebase_analysis - - python - - javascript - - typescript + - github_analysis variables: depth: comprehensive stages: diff --git a/uv.lock b/uv.lock index b16a7ef..ca708f7 100644 --- a/uv.lock +++ b/uv.lock @@ -5204,7 +5204,7 @@ wheels = [ [[package]] name = "skill-seekers" -version = "3.1.1" +version = "3.1.2" source = { editable = "." } dependencies = [ { name = "anthropic" },