feat(C3.9): Add project documentation extraction from markdown files

- Scan ALL .md files in project (README, docs/, etc.)
- Smart categorization by folder/filename (overview, architecture, guides, etc.)
- Processing depth: surface=raw copy, deep=parse+summarize, full=AI-enhanced
- AI enhancement at level 2+ adds topic extraction and cross-references
- New "Project Documentation" section in SKILL.md with summaries
- Output to references/documentation/ organized by category
- Default ON, use --skip-docs to disable
- Add skip_docs parameter to MCP scrape_codebase_tool
- Add 15 new tests for markdown documentation features

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-31 13:54:56 +03:00
parent 4cfb94e14f
commit 170dd0fd75
6 changed files with 845 additions and 4 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -297,9 +297,17 @@ skill-seekers analyze --directory . --skip-patterns --skip-how-to-guides
 ```

 - Generates 300+ line standalone SKILL.md files from codebases
- All C3.x features integrated (patterns, tests, guides, config, architecture)
+- All C3.x features integrated (patterns, tests, guides, config, architecture, docs)
 - Complete codebase analysis without documentation scraping

+**C3.9 Project Documentation Extraction** (`codebase_scraper.py`):
+- Extracts and categorizes all markdown files from the project
+- Auto-detects categories: overview, architecture, guides, workflows, features, etc.
+- Integrates documentation into SKILL.md with summaries
+- AI enhancement (level 2+) adds topic extraction and cross-references
+- Controlled by depth: surface=raw copy, deep=parse+summarize, full=AI-enhanced
+- Default ON, use `--skip-docs` to disable
+
 **Key Architecture Decision (BREAKING in v2.5.2):**
 - Changed from opt-in (`--build-*`) to opt-out (`--skip-*`) flags
 - All analysis features now ON by default for maximum value
--- a/src/skill_seekers/cli/codebase_scraper.py
+++ b/src/skill_seekers/cli/codebase_scraper.py
@@ -75,6 +75,53 @@ LANGUAGE_EXTENSIONS = {
    ".php": "PHP",
 }

+# File suffixes treated as markdown (compared against the lowercased suffix)
+MARKDOWN_EXTENSIONS = {".md", ".markdown", ".mdown", ".mkd"}
+
+# Common documentation folder names (NOTE(review): no use visible in this change — confirm)
+DOC_FOLDERS = {"docs", "doc", "documentation", "wiki", ".github"}
+
+# Root-level doc files → category mapping (first matching key wins)
+ROOT_DOC_CATEGORIES = {
+    "readme": "overview",
+    "contributing": "contributing",
+    "changelog": "changelog",
+    "history": "changelog",
+    "license": "license",
+    "authors": "authors",
+    "code_of_conduct": "community",
+    "security": "security",
+    "architecture": "architecture",
+    "design": "architecture",
+}
+
+# Folder name → category mapping (checked for every directory in a file's relative path)
+FOLDER_CATEGORIES = {
+    "architecture": "architecture",
+    "arch": "architecture",
+    "design": "architecture",
+    "guides": "guides",
+    "guide": "guides",
+    "tutorials": "guides",
+    "tutorial": "guides",
+    "howto": "guides",
+    "how-to": "guides",
+    "workflows": "workflows",
+    "workflow": "workflows",
+    "templates": "templates",
+    "template": "templates",
+    "api": "api",
+    "reference": "api",
+    "examples": "examples",
+    "example": "examples",
+    "specs": "specifications",
+    "spec": "specifications",
+    "rfcs": "specifications",
+    "rfc": "specifications",
+    "features": "features",
+    "feature": "features",
+}
+
 # Default directories to exclude
 DEFAULT_EXCLUDED_DIRS = {
    "node_modules",
@@ -216,6 +263,469 @@ def walk_directory(
    return sorted(files)


+def walk_markdown_files(
+    root: Path,
+    gitignore_spec: pathspec.PathSpec | None = None,
+    excluded_dirs: set | None = None,
+) -> list[Path]:
+    """
+    Walk directory tree and collect markdown documentation files.
+
+    Args:
+        root: Root directory to walk (resolved to an absolute path first)
+        gitignore_spec: Optional PathSpec object for .gitignore rules
+        excluded_dirs: Directory names to prune (default: DEFAULT_EXCLUDED_DIRS)
+
+    Returns:
+        Sorted list of markdown file paths located under the resolved root
+    """
+    if excluded_dirs is None:
+        excluded_dirs = DEFAULT_EXCLUDED_DIRS
+
+    files = []
+    root = Path(root).resolve()
+
+    for dirpath, dirnames, filenames in os.walk(root):
+        current_dir = Path(dirpath)
+
+        # Prune excluded directories in place so os.walk skips their subtrees
+        dirnames[:] = [d for d in dirnames if not should_exclude_dir(d, excluded_dirs)]
+
+        for filename in filenames:
+            file_path = current_dir / filename
+
+            # Cheap suffix test first: non-markdown files never reach the spec
+            if file_path.suffix.lower() not in MARKDOWN_EXTENSIONS:
+                continue
+
+            # Check .gitignore rules against the path relative to root
+            if gitignore_spec:
+                try:
+                    rel_path = file_path.relative_to(root)
+                    if gitignore_spec.match_file(str(rel_path)):
+                        logger.debug(f"Skipping (gitignore): {rel_path}")
+                        continue
+                except ValueError:
+                    continue
+
+            files.append(file_path)
+
+    return sorted(files)
+
+
+def categorize_markdown_file(file_path: Path, root: Path) -> str:
+    """
+    Categorize a markdown file based on its location and filename.
+
+    Keys match whole "_"-separated words, so "arch" does not hit "search".
+
+    Args:
+        file_path: Path to the markdown file
+        root: Root directory of the project
+
+    Returns:
+        Category name (e.g., 'overview', 'guides', 'architecture')
+    """
+    import re
+
+    fold = str.maketrans("- ", "__")  # treat "-" and " " like "_" in names and keys
+    def lookup(name: str, table: dict[str, str]) -> str | None:
+        folded = name.lower().translate(fold)
+        for key, category in table.items():  # first matching key wins
+            word = re.escape(key.lower().translate(fold))
+            if re.search(rf"(?<![a-z0-9]){word}(?![a-z0-9])", folded):
+                return category
+        return None
+
+    try:
+        rel_path = file_path.relative_to(root)
+    except ValueError:
+        return "other"  # file is not located under root
+
+    if len(rel_path.parts) == 1:  # root-level file: decide by filename
+        return lookup(file_path.stem, ROOT_DOC_CATEGORIES) or "overview"
+    hits = (lookup(part, FOLDER_CATEGORIES) for part in rel_path.parts[:-1])
+    return next((hit for hit in hits if hit), "other")
+
+
+def extract_markdown_structure(content: str) -> dict[str, Any]:
+    """
+    Extract structure from markdown content (headers, code blocks, links).
+
+    Header-looking lines inside ``` fences (e.g. "# install") are ignored.
+
+    Args:
+        content: Markdown file content
+
+    Returns:
+        Dictionary with title, headers, code_blocks, links, word_count, line_count
+    """
+    import re
+
+    lines = content.split("\n")
+    structure = {
+        "title": None,
+        "headers": [],
+        "code_blocks": [],
+        "links": [],
+        "word_count": len(content.split()),
+        "line_count": len(lines),
+    }
+
+    # Extract ATX headers, skipping everything between ``` fences
+    in_fence = False
+    for i, line in enumerate(lines):
+        if line.lstrip().startswith("```"):
+            in_fence = not in_fence
+            continue
+        if in_fence:
+            continue
+        header_match = re.match(r"^(#{1,6})\s+(.+)$", line)
+        if header_match:
+            level = len(header_match.group(1))
+            text = header_match.group(2).strip()
+            structure["headers"].append({"level": level, "text": text, "line": i + 1})
+            # First h1 is the title
+            if level == 1 and structure["title"] is None:
+                structure["title"] = text
+
+    # Extract code blocks (fenced)
+    code_block_pattern = re.compile(r"```(\w*)\n(.*?)```", re.DOTALL)
+    for match in code_block_pattern.finditer(content):
+        language = match.group(1) or "text"
+        code = match.group(2).strip()
+        if code:
+            structure["code_blocks"].append({
+                "language": language,
+                "code": code[:500],  # Truncate long code blocks
+                "full_length": len(code),
+            })
+
+    # Extract inline links of the form [text](url)
+    link_pattern = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
+    for match in link_pattern.finditer(content):
+        structure["links"].append({"text": match.group(1), "url": match.group(2)})
+
+    return structure
+
+
+def generate_markdown_summary(content: str, structure: dict[str, Any], max_length: int = 500) -> str:
+    """
+    Generate a summary of markdown content.
+
+    Args:
+        content: Full markdown content
+        structure: Extracted structure from extract_markdown_structure()
+        max_length: Maximum summary length, including any trailing "..."
+
+    Returns:
+        Summary (title, sections, first prose paragraph, stats), at most max_length chars
+    """
+    summary_parts = []
+
+    # Start with title if available
+    if structure.get("title"):
+        summary_parts.append(f"**{structure['title']}**")
+
+    # Add header outline (first 5 h2/h3 headers)
+    h2_h3 = [h for h in structure.get("headers", []) if h["level"] in (2, 3)][:5]
+    if h2_h3:
+        summary_parts.append(f"Sections: {', '.join(h['text'] for h in h2_h3)}")
+
+    # Extract first paragraph (skip headers, blank lines and fenced code)
+    first_para = []
+    in_fence = False
+    for line in content.split("\n"):
+        stripped = line.strip()
+        if stripped.startswith("```"):
+            in_fence = not in_fence
+        if in_fence or stripped.startswith(("#", "```")):
+            if first_para:
+                break
+            continue
+        if stripped:
+            first_para.append(stripped)
+        elif first_para:
+            break
+
+    if first_para:
+        para_text = " ".join(first_para)
+        if len(para_text) > 200:
+            para_text = para_text[:200] + "..."
+        summary_parts.append(para_text)
+
+    # Add stats
+    stats = f"({structure.get('word_count', 0)} words, {len(structure.get('code_blocks', []))} code blocks)"
+    summary_parts.append(stats)
+
+    summary = "\n".join(summary_parts)
+    if len(summary) > max_length:
+        # Reserve room for the ellipsis so the result never exceeds max_length
+        summary = (summary[: max(max_length - 3, 0)] + "...")[: max(max_length, 0)]
+
+    return summary
+
+
+def process_markdown_docs(
+    directory: Path,
+    output_dir: Path,
+    depth: str = "deep",
+    gitignore_spec: pathspec.PathSpec | None = None,
+    enhance_with_ai: bool = False,
+    ai_mode: str = "none",
+) -> dict[str, Any]:
+    """
+    Process all markdown documentation files in a directory.
+
+    Files are copied to <output_dir>/documentation/<category>/ and an index
+    is written to <output_dir>/documentation/documentation_index.json.
+
+    Args:
+        directory: Root directory to scan
+        output_dir: Output directory for processed docs
+        depth: Processing depth ('surface', 'deep', 'full')
+        gitignore_spec: Optional .gitignore spec
+        enhance_with_ai: Whether to use AI enhancement
+        ai_mode: AI mode ('none', 'auto', 'api', 'local')
+
+    Returns:
+        Index dictionary with "total_files", "categories" and "files"
+    """
+    import shutil
+
+    # Resolve: walk_markdown_files() yields resolved paths and relative_to() must match
+    directory = Path(directory).resolve()
+    logger.info("Scanning for markdown documentation...")
+
+    md_files = walk_markdown_files(directory, gitignore_spec)
+    logger.info(f"Found {len(md_files)} markdown files")
+
+    if not md_files:
+        return {"files": [], "categories": {}, "total_files": 0}
+
+    processed_docs = []
+    categories = {}
+
+    for md_path in md_files:
+        try:
+            content = md_path.read_text(encoding="utf-8", errors="ignore")
+            rel_path = str(md_path.relative_to(directory))
+            category = categorize_markdown_file(md_path, directory)
+
+            doc_data = {
+                "path": rel_path,
+                "filename": md_path.name,
+                "category": category,
+                "size_bytes": len(content.encode("utf-8")),
+            }
+
+            # Surface depth keeps only the fields above; deep/full add structure
+            if depth != "surface":
+                structure = extract_markdown_structure(content)
+                summary = generate_markdown_summary(content, structure)
+                doc_data.update({
+                    "title": structure.get("title") or md_path.stem,
+                    "structure": structure,
+                    "summary": summary,
+                    "content": content if depth == "full" else None,
+                })
+
+            processed_docs.append(doc_data)
+            categories.setdefault(category, []).append(rel_path)
+        except Exception as e:
+            logger.warning(f"Failed to process {md_path}: {e}")
+            continue
+
+    # AI enhancement (only when the caller enables it and a mode is selected)
+    if enhance_with_ai and ai_mode != "none" and processed_docs:
+        logger.info("🤖 Enhancing documentation analysis with AI...")
+        try:
+            processed_docs = _enhance_docs_with_ai(processed_docs, ai_mode)
+            logger.info("✅ AI documentation enhancement complete")
+        except Exception as e:
+            logger.warning(f"⚠️  AI enhancement failed: {e}")
+
+    # Save processed docs to output
+    docs_output_dir = output_dir / "documentation"
+    docs_output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Copy files organized by category
+    used_names: dict[str, set[str]] = {}
+    for doc in processed_docs:
+        try:
+            category_dir = docs_output_dir / doc["category"]
+            category_dir.mkdir(parents=True, exist_ok=True)
+
+            # Two docs in one category may share a filename (e.g. several
+            # README.md); flatten the relative path instead of overwriting
+            taken = used_names.setdefault(doc["category"], set())
+            dest_name = doc["filename"]
+            if dest_name in taken:
+                dest_name = "__".join(Path(doc["path"]).parts)
+            taken.add(dest_name)
+            shutil.copy2(directory / doc["path"], category_dir / dest_name)
+        except Exception as e:
+            logger.warning(f"Failed to copy {doc['path']}: {e}")
+
+    # Save documentation index
+    index_data = {
+        "total_files": len(processed_docs),
+        "categories": categories,
+        "files": processed_docs,
+    }
+
+    index_json = docs_output_dir / "documentation_index.json"
+    with open(index_json, "w", encoding="utf-8") as f:
+        json.dump(index_data, f, indent=2, default=str)
+
+    logger.info(f"✅ Processed {len(processed_docs)} documentation files in {len(categories)} categories")
+    logger.info(f"📁 Saved to: {docs_output_dir}")
+
+    return index_data
+
+
+def _enhance_docs_with_ai(docs: list[dict], ai_mode: str) -> list[dict]:
+    """
+    Dispatch AI enhancement to the API or LOCAL backend.
+
+    Args:
+        docs: Processed document dictionaries to enhance
+        ai_mode: 'api', 'local' or 'auto' (API when a key is set, else LOCAL)
+
+    Returns:
+        The enhanced list, or docs unchanged when no backend applies
+    """
+    api_allowed = ai_mode in ("api", "auto")
+    local_allowed = ai_mode in ("local", "auto")
+    # API backend is preferred, but only usable with ANTHROPIC_API_KEY set
+    api_key = os.environ.get("ANTHROPIC_API_KEY") if api_allowed else None
+    if api_key:
+        return _enhance_docs_api(docs, api_key)
+
+    # Otherwise use the LOCAL backend when the mode permits it
+    if local_allowed:
+        return _enhance_docs_local(docs)
+    return docs
+
+
+def _enhance_docs_api(docs: list[dict], api_key: str) -> list[dict]:
+    """
+    Enhance docs in place using the Claude API, in batches of 10.
+
+    Each prompt entry names its file so the model can echo it back: replies
+    are merged by filename. Failures are logged and docs returned as-is.
+    """
+    try:
+        import re
+        import anthropic
+        client = anthropic.Anthropic(api_key=api_key)
+        batch_size = 10
+        for i in range(0, len(docs), batch_size):
+            batch = docs[i:i + batch_size]
+            docs_text = "\n\n".join([
+                f"## {d.get('title', d['filename'])}\nFile: {d['filename']}\nCategory: {d['category']}\nSummary: {d.get('summary', 'N/A')}"
+                for d in batch if d.get("summary")
+            ])
+            if not docs_text:
+                continue  # nothing in this batch has a summary to analyse
+
+            prompt = f"""Analyze these documentation files and provide:
+1. A brief description of what each document covers
+2. Key topics/concepts mentioned
+3. How they relate to each other
+
+Documents:
+{docs_text}
+
+Return JSON with format:
+{{"enhancements": [{{"filename": "...", "description": "...", "key_topics": [...], "related_to": [...]}}]}}"""
+
+            response = client.messages.create(
+                model="claude-sonnet-4-20250514",
+                max_tokens=2000,
+                messages=[{"role": "user", "content": prompt}]
+            )
+
+            # Parse response and merge enhancements (matched by filename)
+            try:
+                json_match = re.search(r"\{.*\}", response.content[0].text, re.DOTALL)
+                if json_match:
+                    enhancements = json.loads(json_match.group())
+                    for enh in enhancements.get("enhancements", []):
+                        for doc in batch:
+                            if doc["filename"] == enh.get("filename"):
+                                doc["ai_description"] = enh.get("description")
+                                doc["ai_topics"] = enh.get("key_topics") or []
+                                doc["ai_related"] = enh.get("related_to") or []
+            except Exception as e:
+                logger.debug(f"Could not parse AI enhancement response: {e}")
+
+    except Exception as e:
+        logger.warning(f"API enhancement failed: {e}")
+
+    return docs
+
+
+def _enhance_docs_local(docs: list[dict]) -> list[dict]:
+    """
+    Enhance docs in place using the Claude Code CLI (LOCAL mode).
+
+    At most 20 summarized docs go into one prompt, passed via ``-p``; the
+    JSON reply is merged by filename. Failures are logged, never raised.
+    """
+    import re
+    import subprocess
+
+    # Prepare batch of docs for enhancement
+    docs_with_summary = [d for d in docs if d.get("summary")]
+    if not docs_with_summary:
+        return docs
+
+    docs_text = "\n\n".join([
+        f"## {d.get('title', d['filename'])}\nCategory: {d['category']}\nPath: {d['path']}\nSummary: {d.get('summary', 'N/A')}"
+        for d in docs_with_summary[:20]  # Limit to 20 docs
+    ])
+
+    prompt = f"""Analyze these documentation files from a codebase and provide insights.
+
+For each document, provide:
+1. A brief description of what it covers
+2. Key topics/concepts
+3. Related documents
+
+Documents:
+{docs_text}
+
+Output JSON only:
+{{"enhancements": [{{"filename": "...", "description": "...", "key_topics": ["..."], "related_to": ["..."]}}]}}"""
+
+    try:
+        # The prompt is passed as an argv element, so no temp file is required.
+        # NOTE(review): confirm --dangerously-skip-permissions is really wanted here.
+        result = subprocess.run(
+            ["claude", "--dangerously-skip-permissions", "-p", prompt],
+            capture_output=True,
+            text=True,
+            timeout=120,
+        )
+
+        if result.returncode == 0 and result.stdout:
+            json_match = re.search(r"\{.*\}", result.stdout, re.DOTALL)
+            if json_match:
+                enhancements = json.loads(json_match.group())
+                for enh in enhancements.get("enhancements", []):
+                    for doc in docs:
+                        if doc["filename"] == enh.get("filename"):
+                            doc["ai_description"] = enh.get("description")
+                            doc["ai_topics"] = enh.get("key_topics") or []
+                            doc["ai_related"] = enh.get("related_to") or []
+
+    except Exception as e:
+        logger.warning(f"LOCAL enhancement failed: {e}")
+
+    return docs
+
+
 def analyze_codebase(
    directory: Path,
    output_dir: Path,
@@ -229,6 +739,7 @@ def analyze_codebase(
    extract_test_examples: bool = True,
    build_how_to_guides: bool = True,
    extract_config_patterns: bool = True,
+    extract_docs: bool = True,
    enhance_level: int = 0,
 ) -> dict[str, Any]:
    """
@@ -247,7 +758,8 @@ def analyze_codebase(
        extract_test_examples: Extract usage examples from test files
        build_how_to_guides: Build how-to guides from workflow examples (C3.3)
        extract_config_patterns: Extract configuration patterns from config files (C3.4)
-        enhance_level: AI enhancement level (0=off, 1=SKILL.md only, 2=+config+arch, 3=full)
+        extract_docs: Extract and process markdown documentation files (default: True)
+        enhance_level: AI enhancement level (0=off, 1=SKILL.md only, 2=+config+arch+docs, 3=full)

    Returns:
        Analysis results dictionary
@@ -622,6 +1134,33 @@ def analyze_codebase(
    else:
        logger.info("No clear architectural patterns detected")

+    # Extract markdown documentation (C3.9)
+    docs_data = None
+    if extract_docs:
+        logger.info("Extracting project documentation...")
+        try:
+            # Determine AI enhancement for docs (level 2+)
+            enhance_docs_ai = enhance_level >= 2
+            docs_data = process_markdown_docs(
+                directory=directory,
+                output_dir=output_dir,
+                depth=depth,
+                gitignore_spec=gitignore_spec,
+                enhance_with_ai=enhance_docs_ai,
+                ai_mode=ai_mode,
+            )
+
+            if docs_data and docs_data.get("total_files", 0) > 0:
+                logger.info(
+                    f"✅ Extracted {docs_data['total_files']} documentation files "
+                    f"in {len(docs_data.get('categories', {}))} categories"
+                )
+            else:
+                logger.info("No markdown documentation files found")
+        except Exception as e:
+            logger.warning(f"Documentation extraction failed: {e}")
+            docs_data = None
+
    # Generate SKILL.md and references/ directory
    logger.info("Generating SKILL.md and references...")
    _generate_skill_md(
@@ -634,6 +1173,8 @@ def analyze_codebase(
        detect_patterns=detect_patterns,
        extract_test_examples=extract_test_examples,
        extract_config_patterns=extract_config_patterns,
+        extract_docs=extract_docs,
+        docs_data=docs_data,
    )

    return results
@@ -649,6 +1190,8 @@ def _generate_skill_md(
    detect_patterns: bool,
    extract_test_examples: bool,
    extract_config_patterns: bool,
+    extract_docs: bool = True,
+    docs_data: dict[str, Any] | None = None,
 ):
    """
    Generate rich SKILL.md from codebase analysis results.
@@ -728,7 +1271,10 @@ Use this skill when you need to:
        skill_content += "- ✅ Test Examples (C3.2)\n"
    if extract_config_patterns:
        skill_content += "- ✅ Configuration Patterns (C3.4)\n"
-    skill_content += "- ✅ Architectural Analysis (C3.7)\n\n"
+    skill_content += "- ✅ Architectural Analysis (C3.7)\n"
+    if extract_docs:
+        skill_content += "- ✅ Project Documentation (C3.9)\n"
+    skill_content += "\n"

    # Add design patterns if available
    if detect_patterns:
@@ -759,6 +1305,12 @@ Use this skill when you need to:
        if config_content:
            skill_content += config_content

+    # Add project documentation if available
+    if extract_docs and docs_data:
+        docs_content = _format_documentation_section(output_dir, docs_data)
+        if docs_content:
+            skill_content += docs_content
+
    # Available references
    skill_content += "## 📚 Available References\n\n"
    skill_content += "This skill includes detailed reference documentation:\n\n"
@@ -788,6 +1340,9 @@ Use this skill when you need to:
    if (output_dir / "architecture").exists():
        skill_content += "- **Architecture**: `references/architecture/` - Architectural patterns\n"
        refs_added = True
+    if extract_docs and (output_dir / "documentation").exists():
+        skill_content += "- **Documentation**: `references/documentation/` - Project documentation\n"
+        refs_added = True

    if not refs_added:
        skill_content += "No additional references generated (analysis features disabled).\n"
@@ -1017,6 +1572,75 @@ def _format_config_section(output_dir: Path) -> str:
    return content


+def _format_documentation_section(output_dir: Path, docs_data: dict[str, Any]) -> str:
+    """
+    Format the "Project Documentation" section of SKILL.md.
+
+    Shows up to 6 categories (priority ones first) with up to 5 docs each,
+    plus the most common AI topics when present. output_dir is unused.
+
+    Returns:
+        Markdown text, or "" when docs_data holds no files
+    """
+    from collections import Counter
+
+    if not docs_data or docs_data.get("total_files", 0) == 0:
+        return ""
+
+    categories = docs_data.get("categories", {})
+    files = docs_data.get("files", [])
+
+    content = "## 📖 Project Documentation\n\n"
+    content += "*Extracted from markdown files in the project (C3.9)*\n\n"
+    content += f"**Total Documentation Files:** {docs_data['total_files']}\n"
+    content += f"**Categories:** {len(categories)}\n\n"
+
+    # Most important categories first, the remaining ones alphabetically
+    priority_order = ["overview", "architecture", "guides", "workflows", "features", "api", "examples"]
+    sorted_categories = [cat for cat in priority_order if cat in categories]
+    sorted_categories += sorted(cat for cat in categories if cat not in priority_order)
+
+    for category in sorted_categories[:6]:  # Limit to 6 categories in SKILL.md
+        cat_files = categories[category]
+        content += f"### {category.title()}\n\n"
+
+        cat_docs = [f for f in files if f.get("category") == category]
+        for doc in cat_docs[:5]:  # Limit to 5 docs per category
+            title = doc.get("title") or doc.get("filename", "Unknown")
+            path = doc.get("path", "")
+
+            if doc.get("ai_description"):
+                content += f"- **{title}**: {doc['ai_description']}\n"
+            elif doc.get("summary"):
+                # The summary opens with the bold title; skip that line so
+                # the entry does not read "**Title**: **Title**"
+                lines = [ln for ln in doc["summary"].split("\n") if ln != f"**{title}**"]
+                summary = lines[0] if lines else doc["summary"].split("\n")[0]
+                if len(summary) > 100:
+                    summary = summary[:100] + "..."
+                content += f"- **{title}**: {summary}\n"
+            else:
+                content += f"- **{title}** (`{path}`)\n"
+
+        if len(cat_files) > 5:
+            content += f"- *...and {len(cat_files) - 5} more*\n"
+
+        content += "\n"
+
+    # AI-enhanced topics if available ("ai_topics" may be missing or None)
+    all_topics = []
+    for doc in files:
+        all_topics.extend(doc.get("ai_topics") or [])
+
+    if all_topics:
+        # Deduplicate, keeping the 10 most frequent topics
+        top_topics = [str(t) for t, _ in Counter(all_topics).most_common(10)]
+        content += f"**Key Topics:** {', '.join(top_topics)}\n\n"
+
+    content += "*See `references/documentation/` for all project documentation*\n\n"
+    return content
+
+
 def _generate_references(output_dir: Path):
    """
    Generate references/ directory structure by symlinking analysis output.
@@ -1035,6 +1659,7 @@ def _generate_references(output_dir: Path):
        "tutorials": "tutorials",
        "config_patterns": "config_patterns",
        "architecture": "architecture",
+        "documentation": "documentation",
    }

    for source, target in mappings.items():
@@ -1144,6 +1769,12 @@ Examples:
        default=False,
        help="Skip configuration pattern extraction from config files (JSON, YAML, TOML, ENV, etc.) (default: enabled)",
    )
+    parser.add_argument(
+        "--skip-docs",
+        action="store_true",
+        default=False,
+        help="Skip project documentation extraction from markdown files (README, docs/, etc.) (default: enabled)",
+    )
    parser.add_argument(
        "--ai-mode",
        choices=["auto", "api", "local", "none"],
@@ -1257,6 +1888,7 @@ Examples:
            extract_test_examples=not args.skip_test_examples,
            build_how_to_guides=not args.skip_how_to_guides,
            extract_config_patterns=not args.skip_config_patterns,
+            extract_docs=not args.skip_docs,
            enhance_level=args.enhance_level,  # AI enhancement level (0-3)
        )

--- a/src/skill_seekers/cli/main.py
+++ b/src/skill_seekers/cli/main.py
@@ -315,6 +315,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
    analyze_parser.add_argument("--skip-test-examples", action="store_true", help="Skip test examples")
    analyze_parser.add_argument("--skip-how-to-guides", action="store_true", help="Skip guides")
    analyze_parser.add_argument("--skip-config-patterns", action="store_true", help="Skip config")
+    analyze_parser.add_argument("--skip-docs", action="store_true", help="Skip project docs (README, docs/)")
    analyze_parser.add_argument("--no-comments", action="store_true", help="Skip comments")
    analyze_parser.add_argument("--verbose", action="store_true", help="Verbose logging")

@@ -609,6 +610,8 @@ def main(argv: list[str] | None = None) -> int:
                sys.argv.append("--skip-how-to-guides")
            if args.skip_config_patterns:
                sys.argv.append("--skip-config-patterns")
+            if args.skip_docs:
+                sys.argv.append("--skip-docs")
            if args.no_comments:
                sys.argv.append("--no-comments")
            if args.verbose:
--- a/src/skill_seekers/mcp/tools/scraping_tools.py
+++ b/src/skill_seekers/mcp/tools/scraping_tools.py
@@ -464,6 +464,7 @@ async def scrape_codebase_tool(args: dict) -> list[TextContent]:
            - skip_test_examples (bool, optional): Skip test example extraction (default: False)
            - skip_how_to_guides (bool, optional): Skip how-to guide generation (default: False)
            - skip_config_patterns (bool, optional): Skip config pattern extraction (default: False)
+            - skip_docs (bool, optional): Skip project documentation extraction (default: False)

    Returns:
        List[TextContent]: Tool execution results
@@ -497,6 +498,7 @@ async def scrape_codebase_tool(args: dict) -> list[TextContent]:
    skip_test_examples = args.get("skip_test_examples", False)
    skip_how_to_guides = args.get("skip_how_to_guides", False)
    skip_config_patterns = args.get("skip_config_patterns", False)
+    skip_docs = args.get("skip_docs", False)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.codebase_scraper"]
@@ -526,6 +528,8 @@ async def scrape_codebase_tool(args: dict) -> list[TextContent]:
        cmd.append("--skip-how-to-guides")
    if skip_config_patterns:
        cmd.append("--skip-config-patterns")
+    if skip_docs:
+        cmd.append("--skip-docs")

    # Adjust timeout based on enhance_level
    timeout = 600  # 10 minutes base
--- a/tests/test_analyze_command.py
+++ b/tests/test_analyze_command.py
@@ -74,7 +74,8 @@ class TestAnalyzeSubcommand(unittest.TestCase):
            "--skip-patterns",
            "--skip-test-examples",
            "--skip-how-to-guides",
-            "--skip-config-patterns"
+            "--skip-config-patterns",
+            "--skip-docs"
        ])
        self.assertTrue(args.skip_api_reference)
        self.assertTrue(args.skip_dependency_graph)
@@ -82,6 +83,7 @@ class TestAnalyzeSubcommand(unittest.TestCase):
        self.assertTrue(args.skip_test_examples)
        self.assertTrue(args.skip_how_to_guides)
        self.assertTrue(args.skip_config_patterns)
+        self.assertTrue(args.skip_docs)

    def test_backward_compatible_depth_flag(self):
        """Test that deprecated --depth flag still works."""
--- a/tests/test_codebase_scraper.py
+++ b/tests/test_codebase_scraper.py
@@ -21,10 +21,17 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))

 from skill_seekers.cli.codebase_scraper import (
    DEFAULT_EXCLUDED_DIRS,
+    FOLDER_CATEGORIES,
+    MARKDOWN_EXTENSIONS,
+    ROOT_DOC_CATEGORIES,
+    categorize_markdown_file,
    detect_language,
+    extract_markdown_structure,
+    generate_markdown_summary,
    load_gitignore,
    should_exclude_dir,
    walk_directory,
+    walk_markdown_files,
 )


@@ -201,6 +208,191 @@ class TestGitignoreLoading(unittest.TestCase):
            self.assertIsNotNone(spec)


+class TestMarkdownDocumentation(unittest.TestCase):
+    """Tests for markdown documentation extraction (C3.9)"""
+
+    def setUp(self):
+        """Set up test environment"""
+        self.temp_dir = tempfile.mkdtemp()  # fresh scratch directory for every test
+        self.root = Path(self.temp_dir)  # used as the project root in the tests below
+
+    def tearDown(self):
+        """Clean up test environment"""
+        shutil.rmtree(self.temp_dir, ignore_errors=True)  # best-effort: never fail a test on cleanup
+
+    def test_markdown_extensions(self):
+        """Test that markdown extensions are properly defined."""
+        self.assertIn(".md", MARKDOWN_EXTENSIONS)  # entries include the leading dot
+        self.assertIn(".markdown", MARKDOWN_EXTENSIONS)
+
+    def test_root_doc_categories(self):
+        """Test root document category mapping."""
+        self.assertEqual(ROOT_DOC_CATEGORIES.get("readme"), "overview")  # lowercase key, no extension
+        self.assertEqual(ROOT_DOC_CATEGORIES.get("changelog"), "changelog")
+        self.assertEqual(ROOT_DOC_CATEGORIES.get("architecture"), "architecture")
+
+    def test_folder_categories(self):
+        """Test folder category mapping."""
+        self.assertEqual(FOLDER_CATEGORIES.get("guides"), "guides")
+        self.assertEqual(FOLDER_CATEGORIES.get("tutorials"), "guides")  # alias folded into "guides"
+        self.assertEqual(FOLDER_CATEGORIES.get("workflows"), "workflows")
+        self.assertEqual(FOLDER_CATEGORIES.get("architecture"), "architecture")
+
+    def test_walk_markdown_files(self):
+        """Test walking directory for markdown files."""
+        # Create two markdown files (root + docs/) and one non-markdown file
+        (self.root / "README.md").write_text("# Test README")
+        (self.root / "test.py").write_text("print('test')")  # must not be returned
+
+        docs_dir = self.root / "docs"
+        docs_dir.mkdir()
+        (docs_dir / "guide.md").write_text("# Guide")  # checks that subdirectories are walked
+
+        files = walk_markdown_files(self.root)
+
+        # Should find markdown files only: README.md and docs/guide.md, not test.py
+        self.assertEqual(len(files), 2)
+        filenames = [f.name for f in files]  # results expose .name, i.e. path-like objects
+        self.assertIn("README.md", filenames)
+        self.assertIn("guide.md", filenames)
+
+    def test_categorize_root_readme(self):
+        """Test categorizing root README file."""
+        readme_path = self.root / "README.md"
+        readme_path.write_text("# Test")
+
+        category = categorize_markdown_file(readme_path, self.root)
+        self.assertEqual(category, "overview")  # upper-case README.md still maps to "overview"
+
+    def test_categorize_changelog(self):
+        """Test categorizing CHANGELOG file."""
+        changelog_path = self.root / "CHANGELOG.md"
+        changelog_path.write_text("# Changelog")
+
+        category = categorize_markdown_file(changelog_path, self.root)
+        self.assertEqual(category, "changelog")
+
+    def test_categorize_docs_guide(self):
+        """Test categorizing file in docs/guides folder."""
+        guides_dir = self.root / "docs" / "guides"
+        guides_dir.mkdir(parents=True)
+        guide_path = guides_dir / "getting-started.md"
+        guide_path.write_text("# Getting Started")
+
+        category = categorize_markdown_file(guide_path, self.root)
+        self.assertEqual(category, "guides")  # expected from the docs/guides/ location
+
+    def test_categorize_architecture(self):
+        """Test categorizing architecture documentation."""
+        arch_dir = self.root / "docs" / "architecture"
+        arch_dir.mkdir(parents=True)
+        arch_path = arch_dir / "overview.md"
+        arch_path.write_text("# Architecture")
+
+        category = categorize_markdown_file(arch_path, self.root)
+        self.assertEqual(category, "architecture")  # not "overview", despite the overview.md filename
+
+class TestMarkdownStructureExtraction(unittest.TestCase):
+    """Tests for markdown structure extraction"""
+
+    def test_extract_headers(self):
+        """Test extracting headers from markdown."""
+        content = """# Main Title
+
+## Section 1
+Some content
+
+### Subsection
+More content
+
+## Section 2
+"""
+        structure = extract_markdown_structure(content)
+
+        self.assertEqual(structure["title"], "Main Title")  # text of the H1, without the "# " marker
+        self.assertEqual(len(structure["headers"]), 4)  # one H1, two H2, one H3
+        self.assertEqual(structure["headers"][0]["level"], 1)  # "# Main Title" comes first
+        self.assertEqual(structure["headers"][1]["level"], 2)  # "## Section 1" is second
+
+    def test_extract_code_blocks(self):
+        """Test extracting code blocks from markdown."""
+        content = """# Example
+
+```python
+def hello():
+    print("Hello")
+```
+
+```javascript
+console.log("test");
+```
+"""
+        structure = extract_markdown_structure(content)
+
+        self.assertEqual(len(structure["code_blocks"]), 2)  # one python fence, one javascript fence
+        self.assertEqual(structure["code_blocks"][0]["language"], "python")  # taken from the fence info string
+        self.assertEqual(structure["code_blocks"][1]["language"], "javascript")
+
+    def test_extract_links(self):
+        """Test extracting links from markdown."""
+        content = """# Links
+
+Check out [Example](https://example.com) and [Another](./local.md).
+"""
+        structure = extract_markdown_structure(content)
+
+        self.assertEqual(len(structure["links"]), 2)  # one absolute URL and one relative path
+        self.assertEqual(structure["links"][0]["text"], "Example")  # the first link in the text is at index 0
+        self.assertEqual(structure["links"][0]["url"], "https://example.com")
+
+    def test_word_and_line_count(self):
+        """Test word and line count."""
+        content = "First line\nSecond line\nThird line"  # three lines, no trailing newline
+        structure = extract_markdown_structure(content)
+
+        self.assertEqual(structure["line_count"], 3)
+        self.assertEqual(structure["word_count"], 6)  # First, line, Second, line, Third, line
+
+class TestMarkdownSummaryGeneration(unittest.TestCase):
+    """Tests for markdown summary generation"""
+
+    def test_generate_summary_with_title(self):
+        """Test summary includes title."""
+        content = "# My Title\n\nSome content here."
+        structure = extract_markdown_structure(content)  # the summary needs the parsed structure too
+        summary = generate_markdown_summary(content, structure)
+
+        self.assertIn("**My Title**", summary)  # title is expected in markdown bold
+
+    def test_generate_summary_with_sections(self):
+        """Test summary includes section names."""
+        content = """# Main
+
+## Getting Started
+Content
+
+## Installation
+Content
+
+## Usage
+Content
+"""
+        structure = extract_markdown_structure(content)
+        summary = generate_markdown_summary(content, structure)
+
+        self.assertIn("Sections:", summary)  # NOTE(review): only the label is asserted, not the section names
+
+    def test_generate_summary_truncation(self):
+        """Test summary is truncated to max length."""
+        content = "# Title\n\n" + "Long content. " * 100  # ~1400 characters of body, far above max_length
+        structure = extract_markdown_structure(content)
+        summary = generate_markdown_summary(content, structure, max_length=200)
+
+        self.assertLessEqual(len(summary), 210)  # Allow some buffer for truncation marker
+
 if __name__ == "__main__":
    # Run tests with verbose output
    unittest.main(verbosity=2)