From 9042e1680c8080543be7dc509592094a335a1a91 Mon Sep 17 00:00:00 2001 From: Nick Miethe Date: Thu, 8 Jan 2026 15:33:12 -0500 Subject: [PATCH] Enabling full support of the Claude Code documentation site, with support for all relevant pages and Anthropic's unconventional llms.txt --- configs/claude-code.json | 90 +++++++++++++++----- src/skill_seekers/cli/llms_txt_downloader.py | 17 +++- 2 files changed, 83 insertions(+), 24 deletions(-) diff --git a/configs/claude-code.json b/configs/claude-code.json index c84e709..ee96f68 100644 --- a/configs/claude-code.json +++ b/configs/claude-code.json @@ -1,37 +1,83 @@ { "name": "claude-code", - "description": "Claude Code CLI and development environment. Use for Claude Code features, tools, workflows, MCP integration, configuration, and AI-assisted development.", - "base_url": "https://docs.claude.com/en/docs/claude-code/", + "description": "Claude Code CLI and development environment. Use for Claude Code features, tools, workflows, MCP integration, plugins, hooks, configuration, deployment, and AI-assisted development.", + "base_url": "https://code.claude.com/docs/en/", "start_urls": [ - "https://docs.claude.com/en/docs/claude-code/overview", - "https://docs.claude.com/en/docs/claude-code/quickstart", - "https://docs.claude.com/en/docs/claude-code/common-workflows", - "https://docs.claude.com/en/docs/claude-code/mcp", - "https://docs.claude.com/en/docs/claude-code/settings", - "https://docs.claude.com/en/docs/claude-code/troubleshooting", - "https://docs.claude.com/en/docs/claude-code/iam" + "https://code.claude.com/docs/en/overview", + "https://code.claude.com/docs/en/quickstart", + "https://code.claude.com/docs/en/common-workflows", + "https://code.claude.com/docs/en/claude-code-on-the-web", + "https://code.claude.com/docs/en/desktop", + "https://code.claude.com/docs/en/chrome", + "https://code.claude.com/docs/en/vs-code", + "https://code.claude.com/docs/en/jetbrains", + "https://code.claude.com/docs/en/github-actions", + "https://code.claude.com/docs/en/gitlab-ci-cd", + "https://code.claude.com/docs/en/slack", + "https://code.claude.com/docs/en/sub-agents", + "https://code.claude.com/docs/en/plugins", + "https://code.claude.com/docs/en/discover-plugins", + "https://code.claude.com/docs/en/skills", + "https://code.claude.com/docs/en/output-styles", + "https://code.claude.com/docs/en/hooks-guide", + "https://code.claude.com/docs/en/headless", + "https://code.claude.com/docs/en/mcp", + "https://code.claude.com/docs/en/third-party-integrations", + "https://code.claude.com/docs/en/amazon-bedrock", + "https://code.claude.com/docs/en/google-vertex-ai", + "https://code.claude.com/docs/en/microsoft-foundry", + "https://code.claude.com/docs/en/network-config", + "https://code.claude.com/docs/en/llm-gateway", + "https://code.claude.com/docs/en/devcontainer", + "https://code.claude.com/docs/en/sandboxing", + "https://code.claude.com/docs/en/setup", + "https://code.claude.com/docs/en/iam", + "https://code.claude.com/docs/en/security", + "https://code.claude.com/docs/en/data-usage", + "https://code.claude.com/docs/en/monitoring-usage", + "https://code.claude.com/docs/en/costs", + "https://code.claude.com/docs/en/analytics", + "https://code.claude.com/docs/en/plugin-marketplaces", + "https://code.claude.com/docs/en/settings", + "https://code.claude.com/docs/en/terminal-config", + "https://code.claude.com/docs/en/model-config", + "https://code.claude.com/docs/en/memory", + "https://code.claude.com/docs/en/statusline", + "https://code.claude.com/docs/en/cli-reference", + "https://code.claude.com/docs/en/interactive-mode", + "https://code.claude.com/docs/en/slash-commands", + "https://code.claude.com/docs/en/checkpointing", + "https://code.claude.com/docs/en/hooks", + "https://code.claude.com/docs/en/plugins-reference", + "https://code.claude.com/docs/en/troubleshooting", + "https://code.claude.com/docs/en/legal-and-compliance" ], "selectors": { - "main_content": "#content-container", + "main_content": "#content-area, #content-container, article, main", "title": "h1", "code_blocks": "pre code" }, "url_patterns": { - "include": ["/claude-code/"], - "exclude": ["/api-reference/", "/claude-ai/", "/claude.ai/", "/prompt-engineering/", "/changelog/"] + "include": ["/docs/en/"], + "exclude": [ + "/docs/fr/", "/docs/de/", "/docs/it/", "/docs/ja/", "/docs/es/", + "/docs/ko/", "/docs/zh-CN/", "/docs/zh-TW/", "/docs/ru/", + "/docs/id/", "/docs/pt/", "/changelog", "github.com" + ] }, "categories": { - "getting_started": ["overview", "quickstart", "installation", "setup", "terminal-config"], - "workflows": ["workflow", "common-workflows", "git", "testing", "debugging", "interactive"], + "getting_started": ["overview", "quickstart", "common-workflows"], + "ide_integrations": ["vs-code", "jetbrains", "desktop", "chrome", "claude-code-on-the-web", "slack"], + "ci_cd": ["github-actions", "gitlab-ci-cd"], + "building": ["sub-agents", "subagent", "plugins", "discover-plugins", "skills", "output-styles", "hooks-guide", "headless", "programmatic"], "mcp": ["mcp", "model-context-protocol"], - "configuration": ["config", "settings", "preferences", "customize", "hooks", "statusline", "model-config", "memory", "output-styles"], - "agents": ["agent", "task", "subagent", "sub-agent", "specialized"], - "skills": ["skill", "agent-skill"], - "integrations": ["ide-integrations", "vs-code", "jetbrains", "plugin", "marketplace"], - "deployment": ["bedrock", "vertex", "deployment", "network", "gateway", "devcontainer", "sandboxing", "third-party"], - "reference": ["reference", "api", "command", "cli-reference", "slash", "checkpointing", "headless", "sdk"], - "enterprise": ["iam", "security", "monitoring", "analytics", "costs", "legal", "data-usage"] + "deployment": ["third-party-integrations", "amazon-bedrock", "google-vertex-ai", "microsoft-foundry", "network-config", "llm-gateway", "devcontainer", "sandboxing"], + "administration": ["setup", "iam", "security", "data-usage", "monitoring-usage", "costs", "analytics", "plugin-marketplaces"], + "configuration": ["settings", "terminal-config", "model-config", "memory", "statusline"], + "reference": ["cli-reference", "interactive-mode", "slash-commands", "checkpointing", "hooks", "plugins-reference"], + "troubleshooting": ["troubleshooting"], + "legal": ["legal-and-compliance"] }, "rate_limit": 0.5, - "max_pages": 200 + "max_pages": 250 } diff --git a/src/skill_seekers/cli/llms_txt_downloader.py b/src/skill_seekers/cli/llms_txt_downloader.py index 1049f86..76ec740 100644 --- a/src/skill_seekers/cli/llms_txt_downloader.py +++ b/src/skill_seekers/cli/llms_txt_downloader.py @@ -38,11 +38,24 @@ class LlmsTxtDownloader: def _is_markdown(self, content: str) -> bool: """ - Check if content looks like markdown. + Check if content looks like markdown (not HTML). Returns: - True if content contains markdown patterns + True if content contains markdown patterns and is NOT HTML """ + # First, reject HTML content (common redirect trap) + content_start = content.strip()[:500].lower() + html_indicators = [ + '', + '