Enable full support for the Claude Code documentation site, covering all relevant pages and Anthropic's unconventional llms.txt
This commit is contained in:
@@ -1,37 +1,83 @@
|
||||
{
|
||||
"name": "claude-code",
|
||||
"description": "Claude Code CLI and development environment. Use for Claude Code features, tools, workflows, MCP integration, configuration, and AI-assisted development.",
|
||||
"base_url": "https://docs.claude.com/en/docs/claude-code/",
|
||||
"description": "Claude Code CLI and development environment. Use for Claude Code features, tools, workflows, MCP integration, plugins, hooks, configuration, deployment, and AI-assisted development.",
|
||||
"base_url": "https://code.claude.com/docs/en/",
|
||||
"start_urls": [
|
||||
"https://docs.claude.com/en/docs/claude-code/overview",
|
||||
"https://docs.claude.com/en/docs/claude-code/quickstart",
|
||||
"https://docs.claude.com/en/docs/claude-code/common-workflows",
|
||||
"https://docs.claude.com/en/docs/claude-code/mcp",
|
||||
"https://docs.claude.com/en/docs/claude-code/settings",
|
||||
"https://docs.claude.com/en/docs/claude-code/troubleshooting",
|
||||
"https://docs.claude.com/en/docs/claude-code/iam"
|
||||
"https://code.claude.com/docs/en/overview",
|
||||
"https://code.claude.com/docs/en/quickstart",
|
||||
"https://code.claude.com/docs/en/common-workflows",
|
||||
"https://code.claude.com/docs/en/claude-code-on-the-web",
|
||||
"https://code.claude.com/docs/en/desktop",
|
||||
"https://code.claude.com/docs/en/chrome",
|
||||
"https://code.claude.com/docs/en/vs-code",
|
||||
"https://code.claude.com/docs/en/jetbrains",
|
||||
"https://code.claude.com/docs/en/github-actions",
|
||||
"https://code.claude.com/docs/en/gitlab-ci-cd",
|
||||
"https://code.claude.com/docs/en/slack",
|
||||
"https://code.claude.com/docs/en/sub-agents",
|
||||
"https://code.claude.com/docs/en/plugins",
|
||||
"https://code.claude.com/docs/en/discover-plugins",
|
||||
"https://code.claude.com/docs/en/skills",
|
||||
"https://code.claude.com/docs/en/output-styles",
|
||||
"https://code.claude.com/docs/en/hooks-guide",
|
||||
"https://code.claude.com/docs/en/headless",
|
||||
"https://code.claude.com/docs/en/mcp",
|
||||
"https://code.claude.com/docs/en/third-party-integrations",
|
||||
"https://code.claude.com/docs/en/amazon-bedrock",
|
||||
"https://code.claude.com/docs/en/google-vertex-ai",
|
||||
"https://code.claude.com/docs/en/microsoft-foundry",
|
||||
"https://code.claude.com/docs/en/network-config",
|
||||
"https://code.claude.com/docs/en/llm-gateway",
|
||||
"https://code.claude.com/docs/en/devcontainer",
|
||||
"https://code.claude.com/docs/en/sandboxing",
|
||||
"https://code.claude.com/docs/en/setup",
|
||||
"https://code.claude.com/docs/en/iam",
|
||||
"https://code.claude.com/docs/en/security",
|
||||
"https://code.claude.com/docs/en/data-usage",
|
||||
"https://code.claude.com/docs/en/monitoring-usage",
|
||||
"https://code.claude.com/docs/en/costs",
|
||||
"https://code.claude.com/docs/en/analytics",
|
||||
"https://code.claude.com/docs/en/plugin-marketplaces",
|
||||
"https://code.claude.com/docs/en/settings",
|
||||
"https://code.claude.com/docs/en/terminal-config",
|
||||
"https://code.claude.com/docs/en/model-config",
|
||||
"https://code.claude.com/docs/en/memory",
|
||||
"https://code.claude.com/docs/en/statusline",
|
||||
"https://code.claude.com/docs/en/cli-reference",
|
||||
"https://code.claude.com/docs/en/interactive-mode",
|
||||
"https://code.claude.com/docs/en/slash-commands",
|
||||
"https://code.claude.com/docs/en/checkpointing",
|
||||
"https://code.claude.com/docs/en/hooks",
|
||||
"https://code.claude.com/docs/en/plugins-reference",
|
||||
"https://code.claude.com/docs/en/troubleshooting",
|
||||
"https://code.claude.com/docs/en/legal-and-compliance"
|
||||
],
|
||||
"selectors": {
|
||||
"main_content": "#content-container",
|
||||
"main_content": "#content-area, #content-container, article, main",
|
||||
"title": "h1",
|
||||
"code_blocks": "pre code"
|
||||
},
|
||||
"url_patterns": {
|
||||
"include": ["/claude-code/"],
|
||||
"exclude": ["/api-reference/", "/claude-ai/", "/claude.ai/", "/prompt-engineering/", "/changelog/"]
|
||||
"include": ["/docs/en/"],
|
||||
"exclude": [
|
||||
"/docs/fr/", "/docs/de/", "/docs/it/", "/docs/ja/", "/docs/es/",
|
||||
"/docs/ko/", "/docs/zh-CN/", "/docs/zh-TW/", "/docs/ru/",
|
||||
"/docs/id/", "/docs/pt/", "/changelog", "github.com"
|
||||
]
|
||||
},
|
||||
"categories": {
|
||||
"getting_started": ["overview", "quickstart", "installation", "setup", "terminal-config"],
|
||||
"workflows": ["workflow", "common-workflows", "git", "testing", "debugging", "interactive"],
|
||||
"getting_started": ["overview", "quickstart", "common-workflows"],
|
||||
"ide_integrations": ["vs-code", "jetbrains", "desktop", "chrome", "claude-code-on-the-web", "slack"],
|
||||
"ci_cd": ["github-actions", "gitlab-ci-cd"],
|
||||
"building": ["sub-agents", "subagent", "plugins", "discover-plugins", "skills", "output-styles", "hooks-guide", "headless", "programmatic"],
|
||||
"mcp": ["mcp", "model-context-protocol"],
|
||||
"configuration": ["config", "settings", "preferences", "customize", "hooks", "statusline", "model-config", "memory", "output-styles"],
|
||||
"agents": ["agent", "task", "subagent", "sub-agent", "specialized"],
|
||||
"skills": ["skill", "agent-skill"],
|
||||
"integrations": ["ide-integrations", "vs-code", "jetbrains", "plugin", "marketplace"],
|
||||
"deployment": ["bedrock", "vertex", "deployment", "network", "gateway", "devcontainer", "sandboxing", "third-party"],
|
||||
"reference": ["reference", "api", "command", "cli-reference", "slash", "checkpointing", "headless", "sdk"],
|
||||
"enterprise": ["iam", "security", "monitoring", "analytics", "costs", "legal", "data-usage"]
|
||||
"deployment": ["third-party-integrations", "amazon-bedrock", "google-vertex-ai", "microsoft-foundry", "network-config", "llm-gateway", "devcontainer", "sandboxing"],
|
||||
"administration": ["setup", "iam", "security", "data-usage", "monitoring-usage", "costs", "analytics", "plugin-marketplaces"],
|
||||
"configuration": ["settings", "terminal-config", "model-config", "memory", "statusline"],
|
||||
"reference": ["cli-reference", "interactive-mode", "slash-commands", "checkpointing", "hooks", "plugins-reference"],
|
||||
"troubleshooting": ["troubleshooting"],
|
||||
"legal": ["legal-and-compliance"]
|
||||
},
|
||||
"rate_limit": 0.5,
|
||||
"max_pages": 200
|
||||
"max_pages": 250
|
||||
}
|
||||
|
||||
@@ -38,11 +38,24 @@ class LlmsTxtDownloader:
|
||||
|
||||
def _is_markdown(self, content: str) -> bool:
|
||||
"""
|
||||
Check if content looks like markdown.
|
||||
Check if content looks like markdown (not HTML).
|
||||
|
||||
Returns:
|
||||
True if content contains markdown patterns
|
||||
True if content contains markdown patterns and is NOT HTML
|
||||
"""
|
||||
# First, reject HTML content (common redirect trap)
|
||||
content_start = content.strip()[:500].lower()
|
||||
html_indicators = [
|
||||
'<!doctype html',
|
||||
'<html',
|
||||
'<!doctype',
|
||||
'<head>',
|
||||
'<meta charset',
|
||||
]
|
||||
if any(indicator in content_start for indicator in html_indicators):
|
||||
return False
|
||||
|
||||
# Then check for markdown patterns
|
||||
markdown_patterns = ['# ', '## ', '```', '- ', '* ', '`']
|
||||
return any(pattern in content for pattern in markdown_patterns)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user