From 9949cdcdca6ca7dd198093cda816db9096c8f724 Mon Sep 17 00:00:00 2001 From: Chris Engelhard <34413331+digi4care@users.noreply.github.com> Date: Thu, 1 Jan 2026 17:40:51 +0100 Subject: [PATCH] Fix: include docs references in unified skill output (#213) * Fix: include docs references in unified skill output * Fix: quality checker counts nested reference files * fix(unified): pass through llms_txt_url and skip_llms_txt to doc scraper * configs: add svelte CLI unified preset (llms.txt + categories) --------- Co-authored-by: Chris Engelhard --- configs/svelte_cli_unified.json | 68 +++++++++++++++++++ src/skill_seekers/cli/quality_checker.py | 4 +- src/skill_seekers/cli/unified_scraper.py | 11 +++ .../cli/unified_skill_builder.py | 33 ++++++++- 4 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 configs/svelte_cli_unified.json diff --git a/configs/svelte_cli_unified.json b/configs/svelte_cli_unified.json new file mode 100644 index 0000000..d1ad6bd --- /dev/null +++ b/configs/svelte_cli_unified.json @@ -0,0 +1,68 @@ +{ + "name": "svelte-cli", + "description": "Svelte CLI: docs (llms.txt) + GitHub repository (commands, project scaffolding, dev/build workflows).", + "merge_mode": "rule-based", + "sources": [ + { + "type": "documentation", + "base_url": "https://svelte.dev/docs/cli", + "llms_txt_url": "https://svelte.dev/docs/cli/llms.txt", + "extract_api": true, + "selectors": { + "main_content": "#main, main", + "title": "h1", + "code_blocks": "pre code, pre" + }, + "url_patterns": { + "include": ["/docs/cli"], + "exclude": [ + "/docs/kit", + "/docs/svelte", + "/docs/mcp", + "/tutorial", + "/packages", + "/playground", + "/blog" + ] + }, + "categories": { + "overview": ["overview"], + "faq": ["frequently asked questions"], + "sv_create": ["sv create"], + "sv_add": ["sv add"], + "sv_check": ["sv check"], + "sv_migrate": ["sv migrate"], + "devtools_json": ["devtools-json"], + "drizzle": ["drizzle"], + "eslint": ["eslint"], + "lucia": ["lucia"], + "mcp": ["mcp"], + "mdsvex": ["mdsvex"], + "paraglide": ["paraglide"], + "playwright": ["playwright"], + "prettier": ["prettier"], + "storybook": ["storybook"], + "sveltekit_adapter": ["sveltekit-adapter"], + "tailwindcss": ["tailwindcss"], + "vitest": ["vitest"] + }, + "rate_limit": 0.5, + "max_pages": 200 + }, + { + "type": "github", + "repo": "sveltejs/cli", + "include_issues": true, + "max_issues": 150, + "include_changelog": true, + "include_releases": true, + "include_code": true, + "code_analysis_depth": "deep", + "file_patterns": [ + "src/**/*.ts", + "src/**/*.js" + ], + "local_repo_path": "local_paths/sveltekit/cli" + } + ] +} diff --git a/src/skill_seekers/cli/quality_checker.py b/src/skill_seekers/cli/quality_checker.py index 8ff66c5..e9bc9cf 100644 --- a/src/skill_seekers/cli/quality_checker.py +++ b/src/skill_seekers/cli/quality_checker.py @@ -146,7 +146,7 @@ class SkillQualityChecker: 'references/ directory not found - skill may be incomplete', str(self.references_dir) ) - elif not list(self.references_dir.glob('*.md')): + elif not list(self.references_dir.rglob('*.md')): self.report.add_warning( 'structure', 'references/ directory is empty - no reference documentation found', @@ -298,7 +298,7 @@ class SkillQualityChecker: # Check reference files if self.references_dir.exists(): - ref_files = list(self.references_dir.glob('*.md')) + ref_files = list(self.references_dir.rglob('*.md')) if ref_files: self.report.add_info( 'content', diff --git a/src/skill_seekers/cli/unified_scraper.py b/src/skill_seekers/cli/unified_scraper.py index 78bec51..f8b5dcf 100644 --- a/src/skill_seekers/cli/unified_scraper.py +++ b/src/skill_seekers/cli/unified_scraper.py @@ -129,6 +129,17 @@ class UnifiedScraper: 'max_pages': source.get('max_pages', 100) } + # Pass through llms.txt settings (so unified configs behave the same as doc_scraper configs) + if 'llms_txt_url' in source: + doc_config['llms_txt_url'] = source.get('llms_txt_url') + + if 'skip_llms_txt' in source: + doc_config['skip_llms_txt'] = source.get('skip_llms_txt') + + # Optional: support overriding start URLs + if 'start_urls' in source: + doc_config['start_urls'] = source.get('start_urls') + # Write temporary config temp_config_path = os.path.join(self.data_dir, 'temp_docs_config.json') with open(temp_config_path, 'w', encoding='utf-8') as f: diff --git a/src/skill_seekers/cli/unified_skill_builder.py b/src/skill_seekers/cli/unified_skill_builder.py index dd3051d..b8f9700 100644 --- a/src/skill_seekers/cli/unified_skill_builder.py +++ b/src/skill_seekers/cli/unified_skill_builder.py @@ -14,6 +14,7 @@ discrepancies transparently. import os import json +import shutil import logging from pathlib import Path from typing import Dict, List, Any, Optional @@ -286,12 +287,42 @@ This skill combines knowledge from multiple sources: docs_dir = os.path.join(self.skill_dir, 'references', 'documentation') os.makedirs(docs_dir, exist_ok=True) + # Best-effort: copy docs-only reference files into unified docs references. + # UnifiedScraper runs doc_scraper using name "{name}_docs", which creates + # output/{name}_docs/references/*.md. Those are the most useful documentation + # references for the unified skill. + source_refs_dir = os.path.join('output', f"{self.name}_docs", 'references') + copied_files: List[str] = [] + + if os.path.isdir(source_refs_dir): + for entry in sorted(os.listdir(source_refs_dir)): + src_path = os.path.join(source_refs_dir, entry) + dst_path = os.path.join(docs_dir, entry) + if not os.path.isfile(src_path): + continue + shutil.copy2(src_path, dst_path) + copied_files.append(entry) + # Create index index_path = os.path.join(docs_dir, 'index.md') - with open(index_path, 'w') as f: + with open(index_path, 'w', encoding='utf-8') as f: f.write("# Documentation\n\n") f.write("Reference from official documentation.\n\n") + if copied_files: + files_no_index = [p for p in copied_files if p.lower() != 'index.md'] + files_index = [p for p in copied_files if p.lower() == 'index.md'] + + f.write("## Files\n\n") + for filename in files_no_index + files_index: + f.write(f"- [{filename}]({filename})\n") + else: + f.write("## Notes\n\n") + f.write( + "No documentation reference files were copied into this unified skill. " + "This usually means the docs-only build did not produce reference files.\n" + ) + logger.info("Created documentation references") def _generate_github_references(self):