Fix: include docs references in unified skill output (#213)

* Fix: include docs references in unified skill output

* Fix: quality checker counts nested reference files

* fix(unified): pass through llms_txt_url and skip_llms_txt to doc scraper

* configs: add svelte CLI unified preset (llms.txt + categories)

---------

Co-authored-by: Chris Engelhard <chris@chrisengelhard.nl>
This commit is contained in:
Chris Engelhard
2026-01-01 17:40:51 +01:00
committed by GitHub
parent 98d73611ad
commit 9949cdcdca
4 changed files with 113 additions and 3 deletions

View File

@@ -0,0 +1,68 @@
{
"name": "svelte-cli",
"description": "Svelte CLI: docs (llms.txt) + GitHub repository (commands, project scaffolding, dev/build workflows).",
"merge_mode": "rule-based",
"sources": [
{
"type": "documentation",
"base_url": "https://svelte.dev/docs/cli",
"llms_txt_url": "https://svelte.dev/docs/cli/llms.txt",
"extract_api": true,
"selectors": {
"main_content": "#main, main",
"title": "h1",
"code_blocks": "pre code, pre"
},
"url_patterns": {
"include": ["/docs/cli"],
"exclude": [
"/docs/kit",
"/docs/svelte",
"/docs/mcp",
"/tutorial",
"/packages",
"/playground",
"/blog"
]
},
"categories": {
"overview": ["overview"],
"faq": ["frequently asked questions"],
"sv_create": ["sv create"],
"sv_add": ["sv add"],
"sv_check": ["sv check"],
"sv_migrate": ["sv migrate"],
"devtools_json": ["devtools-json"],
"drizzle": ["drizzle"],
"eslint": ["eslint"],
"lucia": ["lucia"],
"mcp": ["mcp"],
"mdsvex": ["mdsvex"],
"paraglide": ["paraglide"],
"playwright": ["playwright"],
"prettier": ["prettier"],
"storybook": ["storybook"],
"sveltekit_adapter": ["sveltekit-adapter"],
"tailwindcss": ["tailwindcss"],
"vitest": ["vitest"]
},
"rate_limit": 0.5,
"max_pages": 200
},
{
"type": "github",
"repo": "sveltejs/cli",
"include_issues": true,
"max_issues": 150,
"include_changelog": true,
"include_releases": true,
"include_code": true,
"code_analysis_depth": "deep",
"file_patterns": [
"src/**/*.ts",
"src/**/*.js"
],
"local_repo_path": "local_paths/sveltekit/cli"
}
]
}

View File

@@ -146,7 +146,7 @@ class SkillQualityChecker:
'references/ directory not found - skill may be incomplete', 'references/ directory not found - skill may be incomplete',
str(self.references_dir) str(self.references_dir)
) )
elif not list(self.references_dir.glob('*.md')): elif not list(self.references_dir.rglob('*.md')):
self.report.add_warning( self.report.add_warning(
'structure', 'structure',
'references/ directory is empty - no reference documentation found', 'references/ directory is empty - no reference documentation found',
@@ -298,7 +298,7 @@ class SkillQualityChecker:
# Check reference files # Check reference files
if self.references_dir.exists(): if self.references_dir.exists():
ref_files = list(self.references_dir.glob('*.md')) ref_files = list(self.references_dir.rglob('*.md'))
if ref_files: if ref_files:
self.report.add_info( self.report.add_info(
'content', 'content',

View File

@@ -129,6 +129,17 @@ class UnifiedScraper:
'max_pages': source.get('max_pages', 100) 'max_pages': source.get('max_pages', 100)
} }
# Pass through llms.txt settings (so unified configs behave the same as doc_scraper configs)
if 'llms_txt_url' in source:
doc_config['llms_txt_url'] = source.get('llms_txt_url')
if 'skip_llms_txt' in source:
doc_config['skip_llms_txt'] = source.get('skip_llms_txt')
# Optional: support overriding start URLs
if 'start_urls' in source:
doc_config['start_urls'] = source.get('start_urls')
# Write temporary config # Write temporary config
temp_config_path = os.path.join(self.data_dir, 'temp_docs_config.json') temp_config_path = os.path.join(self.data_dir, 'temp_docs_config.json')
with open(temp_config_path, 'w', encoding='utf-8') as f: with open(temp_config_path, 'w', encoding='utf-8') as f:

View File

@@ -14,6 +14,7 @@ discrepancies transparently.
import os import os
import json import json
import shutil
import logging import logging
from pathlib import Path from pathlib import Path
from typing import Dict, List, Any, Optional from typing import Dict, List, Any, Optional
@@ -286,12 +287,42 @@ This skill combines knowledge from multiple sources:
docs_dir = os.path.join(self.skill_dir, 'references', 'documentation') docs_dir = os.path.join(self.skill_dir, 'references', 'documentation')
os.makedirs(docs_dir, exist_ok=True) os.makedirs(docs_dir, exist_ok=True)
# Best-effort: copy docs-only reference files into unified docs references.
# UnifiedScraper runs doc_scraper using name "{name}_docs", which creates
# output/{name}_docs/references/*.md. Those are the most useful documentation
# references for the unified skill.
source_refs_dir = os.path.join('output', f"{self.name}_docs", 'references')
copied_files: List[str] = []
if os.path.isdir(source_refs_dir):
for entry in sorted(os.listdir(source_refs_dir)):
src_path = os.path.join(source_refs_dir, entry)
dst_path = os.path.join(docs_dir, entry)
if not os.path.isfile(src_path):
continue
shutil.copy2(src_path, dst_path)
copied_files.append(entry)
# Create index # Create index
index_path = os.path.join(docs_dir, 'index.md') index_path = os.path.join(docs_dir, 'index.md')
with open(index_path, 'w') as f: with open(index_path, 'w', encoding='utf-8') as f:
f.write("# Documentation\n\n") f.write("# Documentation\n\n")
f.write("Reference from official documentation.\n\n") f.write("Reference from official documentation.\n\n")
if copied_files:
files_no_index = [p for p in copied_files if p.lower() != 'index.md']
files_index = [p for p in copied_files if p.lower() == 'index.md']
f.write("## Files\n\n")
for filename in files_no_index + files_index:
f.write(f"- [{filename}]({filename})\n")
else:
f.write("## Notes\n\n")
f.write(
"No documentation reference files were copied into this unified skill. "
"This usually means the docs-only build did not produce reference files.\n"
)
logger.info("Created documentation references") logger.info("Created documentation references")
def _generate_github_references(self): def _generate_github_references(self):