feat(B2): add Microsoft Word (.docx) support

Implements ROADMAP task B2 — full .docx scraping support via mammoth +
python-docx, producing SKILL.md + references/ output identical to other
source types.

New files:
- src/skill_seekers/cli/word_scraper.py — WordToSkillConverter class +
  main() entry point (~600 lines); mammoth → BeautifulSoup pipeline;
  handles headings, code detection (incl. monospace <p><br> blocks),
  tables, images, metadata extraction
- src/skill_seekers/cli/arguments/word.py — add_word_arguments() +
  WORD_ARGUMENTS dict
- src/skill_seekers/cli/parsers/word_parser.py — WordParser for unified
  CLI parser registry
- tests/test_word_scraper.py — comprehensive test suite (~300 lines)

Modified files:
- src/skill_seekers/cli/main.py — registered "word" command module
- src/skill_seekers/cli/source_detector.py — .docx auto-detection +
  _detect_word() classmethod
- src/skill_seekers/cli/create_command.py — _route_word() + --help-word
- src/skill_seekers/cli/arguments/create.py — WORD_ARGUMENTS + routing
- src/skill_seekers/cli/arguments/__init__.py — export word args
- src/skill_seekers/cli/parsers/__init__.py — register WordParser
- src/skill_seekers/cli/unified_scraper.py — _scrape_word() integration
- src/skill_seekers/cli/pdf_scraper.py — fix: real enhancement instead
  of stub; remove [:3] reference file limit; capture run_workflows return
- src/skill_seekers/cli/github_scraper.py — fix: remove arbitrary
  open_issues[:20] / closed_issues[:10] reference file limits
- pyproject.toml — skill-seekers-word entry point + docx optional dep
- tests/test_cli_parsers.py — update parser count 21→22

Bug fixes applied during real-world testing:
- Code detection: detect monospace <p><br> blocks as code (mammoth
  renders Courier paragraphs this way, not as <pre>/<code>)
- Language detector: fix wrong method name detect_from_text →
  detect_from_code
- Description inference: pass None from main() so extract_docx() can
  infer description from Word document subject/title metadata
- Bullet-point guard: exclude prose starting with •/-/* from code scoring
- Enhancement: implement real API/LOCAL enhancement (was stub)
- pip install message: add quotes around skill-seekers[docx]

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-25 21:47:30 +03:00
parent e42aade992
commit b81d55fda0
17 changed files with 2214 additions and 67 deletions

View File

@@ -21,6 +21,7 @@ from .common import add_common_arguments, COMMON_ARGUMENTS
from .scrape import add_scrape_arguments, SCRAPE_ARGUMENTS
from .github import add_github_arguments, GITHUB_ARGUMENTS
from .pdf import add_pdf_arguments, PDF_ARGUMENTS
from .word import add_word_arguments, WORD_ARGUMENTS
from .analyze import add_analyze_arguments, ANALYZE_ARGUMENTS
from .unified import add_unified_arguments, UNIFIED_ARGUMENTS
from .package import add_package_arguments, PACKAGE_ARGUMENTS
@@ -38,11 +39,13 @@ __all__ = [
"add_package_arguments",
"add_upload_arguments",
"add_enhance_arguments",
"add_word_arguments",
# Data
"COMMON_ARGUMENTS",
"SCRAPE_ARGUMENTS",
"GITHUB_ARGUMENTS",
"PDF_ARGUMENTS",
"WORD_ARGUMENTS",
"ANALYZE_ARGUMENTS",
"UNIFIED_ARGUMENTS",
"PACKAGE_ARGUMENTS",

View File

@@ -389,6 +389,18 @@ PDF_ARGUMENTS: dict[str, dict[str, Any]] = {
},
}
# Word document specific (from word.py)
# Word-specific arguments exposed through `create` (subset of arguments/word.py;
# --from-json is standalone-scraper-only and is intentionally absent here).
WORD_ARGUMENTS: dict[str, dict[str, Any]] = {
    "docx": {
        # CLI flag(s) registered for this argument
        "flags": ("--docx",),
        # Keyword arguments forwarded verbatim to parser.add_argument()
        "kwargs": {
            "type": str,
            "help": "DOCX file path",
            "metavar": "PATH",
        },
    },
}
# Multi-source config specific (from unified_scraper.py)
CONFIG_ARGUMENTS: dict[str, dict[str, Any]] = {
"merge_mode": {
@@ -471,6 +483,7 @@ def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]]
"github": GITHUB_ARGUMENTS,
"local": LOCAL_ARGUMENTS,
"pdf": PDF_ARGUMENTS,
"word": WORD_ARGUMENTS,
"config": CONFIG_ARGUMENTS,
}
return source_args.get(source_type, {})
@@ -507,12 +520,13 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
- 'github': Universal + github-specific
- 'local': Universal + local-specific
- 'pdf': Universal + pdf-specific
- 'word': Universal + word-specific
- 'advanced': Advanced/rare arguments
- 'all': All 120+ arguments
Args:
parser: ArgumentParser to add arguments to
mode: Help mode (default, web, github, local, pdf, advanced, all)
mode: Help mode (default, web, github, local, pdf, word, advanced, all)
"""
# Positional argument for source
parser.add_argument(
@@ -543,6 +557,10 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
for arg_name, arg_def in PDF_ARGUMENTS.items():
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
if mode in ["word", "all"]:
for arg_name, arg_def in WORD_ARGUMENTS.items():
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
if mode in ["config", "all"]:
for arg_name, arg_def in CONFIG_ARGUMENTS.items():
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -0,0 +1,66 @@
"""Word document command argument definitions.
This module defines ALL arguments for the word command in ONE place.
Both word_scraper.py (standalone) and parsers/word_parser.py (unified CLI)
import and use these definitions.
Shared arguments (name, description, output, enhance-level, api-key,
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
via ``add_all_standard_arguments()``.
"""
import argparse
from typing import Any
from .common import add_all_standard_arguments
# Word-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
WORD_ARGUMENTS: dict[str, dict[str, Any]] = {
    # --docx PATH: direct path to the source .docx file
    "docx": {
        "flags": ("--docx",),
        "kwargs": dict(type=str, help="Direct DOCX file path", metavar="PATH"),
    },
    # --from-json FILE: skip extraction and build from a previously dumped JSON
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": dict(type=str, help="Build skill from extracted JSON", metavar="FILE"),
    },
}
def add_word_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all word command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds Word-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for Word.

    Args:
        parser: ArgumentParser to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for Word.
    # NOTE: ``parser._actions`` is a private argparse attribute, but there is no
    # public API to change both an already-registered action's default AND its
    # help text (set_defaults() would only cover the default).
    for action in parser._actions:
        if action.dest == "enhance_level":  # every argparse Action has .dest
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for Word), 1=SKILL.md only, 2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, otherwise LOCAL (Claude Code)"
            )
            break  # dest values are unique per parser; no need to keep scanning

    # Word-specific args (keys of WORD_ARGUMENTS are documentation-only here)
    for arg_def in WORD_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -131,6 +131,8 @@ class CreateCommand:
return self._route_local()
elif self.source_info.type == "pdf":
return self._route_pdf()
elif self.source_info.type == "word":
return self._route_word()
elif self.source_info.type == "config":
return self._route_config()
else:
@@ -320,6 +322,29 @@ class CreateCommand:
finally:
sys.argv = original_argv
def _route_word(self) -> int:
    """Route to Word document scraper (word_scraper.py)."""
    from skill_seekers.cli import word_scraper

    # Rebuild argv the way word_scraper's own CLI expects it:
    # program name, the --docx path, then the shared universal flags.
    docx_path = self.source_info.parsed["file_path"]
    scraper_argv = ["word_scraper", "--docx", docx_path]
    self._add_common_args(scraper_argv)

    logger.debug(f"Calling word_scraper with argv: {scraper_argv}")

    # Temporarily swap sys.argv so word_scraper.main() parses our args,
    # restoring the original no matter how main() exits.
    saved_argv = sys.argv
    sys.argv = scraper_argv
    try:
        return word_scraper.main()
    finally:
        sys.argv = saved_argv
def _route_config(self) -> int:
"""Route to unified scraper for config files (unified_scraper.py)."""
from skill_seekers.cli import unified_scraper
@@ -442,6 +467,7 @@ Examples:
GitHub: skill-seekers create facebook/react -p standard
Local: skill-seekers create ./my-project -p comprehensive
PDF: skill-seekers create tutorial.pdf --ocr
DOCX: skill-seekers create document.docx
Config: skill-seekers create configs/react.json
Source Auto-Detection:
@@ -449,6 +475,7 @@ Source Auto-Detection:
• owner/repo → GitHub analysis
• ./path → local codebase
• file.pdf → PDF extraction
• file.docx → Word document extraction
• file.json → multi-source config
Progressive Help (13 → 120+ flags):
@@ -483,6 +510,9 @@ Common Workflows:
"--help-local", action="store_true", help=argparse.SUPPRESS, dest="_help_local"
)
parser.add_argument("--help-pdf", action="store_true", help=argparse.SUPPRESS, dest="_help_pdf")
parser.add_argument(
"--help-word", action="store_true", help=argparse.SUPPRESS, dest="_help_word"
)
parser.add_argument(
"--help-config", action="store_true", help=argparse.SUPPRESS, dest="_help_config"
)
@@ -532,6 +562,15 @@ Common Workflows:
add_create_arguments(parser_pdf, mode="pdf")
parser_pdf.print_help()
return 0
elif args._help_word:
parser_word = argparse.ArgumentParser(
prog="skill-seekers create",
description="Create skill from Word document (.docx)",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
add_create_arguments(parser_word, mode="word")
parser_word.print_help()
return 0
elif args._help_config:
parser_config = argparse.ArgumentParser(
prog="skill-seekers create",

View File

@@ -1296,14 +1296,14 @@ Use this skill when you need to:
closed_issues = [i for i in issues if i["state"] == "closed"]
content += f"## Open Issues ({len(open_issues)})\n\n"
for issue in open_issues[:20]:
for issue in open_issues:
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
content += f"### #{issue['number']}: {issue['title']}\n"
content += f"**Labels:** {labels} | **Created:** {issue['created_at'][:10]}\n"
content += f"[View on GitHub]({issue['url']})\n\n"
content += f"\n## Recently Closed Issues ({len(closed_issues)})\n\n"
for issue in closed_issues[:10]:
for issue in closed_issues:
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
content += f"### #{issue['number']}: {issue['title']}\n"
content += f"**Labels:** {labels} | **Closed:** {issue['closed_at'][:10]}\n"

View File

@@ -47,6 +47,7 @@ COMMAND_MODULES = {
"scrape": "skill_seekers.cli.doc_scraper",
"github": "skill_seekers.cli.github_scraper",
"pdf": "skill_seekers.cli.pdf_scraper",
"word": "skill_seekers.cli.word_scraper",
"unified": "skill_seekers.cli.unified_scraper",
"enhance": "skill_seekers.cli.enhance_command",
"enhance-status": "skill_seekers.cli.enhance_status",

View File

@@ -12,6 +12,7 @@ from .config_parser import ConfigParser
from .scrape_parser import ScrapeParser
from .github_parser import GitHubParser
from .pdf_parser import PDFParser
from .word_parser import WordParser
from .unified_parser import UnifiedParser
from .enhance_parser import EnhanceParser
from .enhance_status_parser import EnhanceStatusParser
@@ -41,6 +42,7 @@ PARSERS = [
EnhanceParser(),
EnhanceStatusParser(),
PDFParser(),
WordParser(),
UnifiedParser(),
EstimateParser(),
InstallParser(),

View File

@@ -0,0 +1,32 @@
"""Word document subcommand parser.
Uses shared argument definitions from arguments.word to ensure
consistency with the standalone word_scraper module.
"""
from .base import SubcommandParser
from skill_seekers.cli.arguments.word import add_word_arguments
class WordParser(SubcommandParser):
    """Unified-CLI parser for the ``word`` subcommand."""

    @property
    def name(self) -> str:
        """Subcommand name as typed on the command line."""
        return "word"

    @property
    def help(self) -> str:
        """One-line help shown in the top-level command list."""
        return "Extract from Word document (.docx)"

    @property
    def description(self) -> str:
        """Longer description shown in this subcommand's own --help."""
        return "Extract content from Word document (.docx) and generate skill"

    def add_arguments(self, parser) -> None:
        """Register word arguments from the shared definitions in arguments.word,
        keeping the unified CLI in sync with the standalone word_scraper."""
        add_word_arguments(parser)

View File

@@ -319,7 +319,7 @@ class PDFToSkillConverter:
code_list = page.get("code_samples") or page.get("code_blocks")
if code_list:
f.write("### Code Examples\n\n")
for code in code_list[:3]: # Limit to top 3
for code in code_list:
lang = code.get("language", "")
f.write(f"```{lang}\n{code['code']}\n```\n\n")
@@ -721,21 +721,44 @@ def main():
# ═══════════════════════════════════════════════════════════════════════════
# Traditional Enhancement (complements workflow system)
# ═══════════════════════════════════════════════════════════════════════════
# Note: Runs independently of workflow system (they complement each other)
if getattr(args, "enhance_level", 0) > 0:
# Traditional AI enhancement (API or LOCAL mode)
import os
api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
mode = "API" if api_key else "LOCAL"
print("\n" + "=" * 80)
print("🤖 Traditional AI Enhancement")
print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
print("=" * 80)
if workflow_executed:
print(f" Running after workflow: {workflow_name}")
print(
" (Workflow provides specialized analysis, enhancement provides general improvements)"
)
print(" (Use --enhance-workflow for more control)")
print("")
# Note: PDF scraper uses enhance_level instead of enhance/enhance_local
# This is consistent with the new unified enhancement system
skill_dir = converter.skill_dir
if api_key:
try:
from skill_seekers.cli.enhance_skill import enhance_skill_md
enhance_skill_md(skill_dir, api_key)
print("✅ API enhancement complete!")
except ImportError:
print("❌ API enhancement not available. Falling back to LOCAL mode...")
from pathlib import Path
from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
enhancer = LocalSkillEnhancer(Path(skill_dir))
enhancer.run(headless=True)
print("✅ Local enhancement complete!")
else:
from pathlib import Path
from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
enhancer = LocalSkillEnhancer(Path(skill_dir))
enhancer.run(headless=True)
print("✅ Local enhancement complete!")
except RuntimeError as e:
print(f"\n❌ Error: {e}", file=sys.stderr)

View File

@@ -60,6 +60,9 @@ class SourceDetector:
if source.endswith(".pdf"):
return cls._detect_pdf(source)
if source.endswith(".docx"):
return cls._detect_word(source)
# 2. Directory detection
if os.path.isdir(source):
return cls._detect_local(source)
@@ -85,6 +88,7 @@ class SourceDetector:
" GitHub: skill-seekers create facebook/react\n"
" Local: skill-seekers create ./my-project\n"
" PDF: skill-seekers create tutorial.pdf\n"
" DOCX: skill-seekers create document.docx\n"
" Config: skill-seekers create configs/react.json"
)
@@ -104,6 +108,14 @@ class SourceDetector:
type="pdf", parsed={"file_path": source}, suggested_name=name, raw_input=source
)
@classmethod
def _detect_word(cls, source: str) -> SourceInfo:
    """Detect Word document (.docx) source."""
    # The filename stem (basename minus extension) becomes the suggested skill name.
    stem, _ext = os.path.splitext(os.path.basename(source))
    return SourceInfo(
        type="word",
        parsed={"file_path": source},
        suggested_name=stem,
        raw_input=source,
    )
@classmethod
def _detect_local(cls, source: str) -> SourceInfo:
"""Detect local directory source."""
@@ -190,6 +202,13 @@ class SourceDetector:
if not os.path.isfile(file_path):
raise ValueError(f"Path is not a file: {file_path}")
elif source_info.type == "word":
file_path = source_info.parsed["file_path"]
if not os.path.exists(file_path):
raise ValueError(f"Word document does not exist: {file_path}")
if not os.path.isfile(file_path):
raise ValueError(f"Path is not a file: {file_path}")
elif source_info.type == "config":
config_path = source_info.parsed["config_path"]
if not os.path.exists(config_path):

View File

@@ -73,11 +73,12 @@ class UnifiedScraper:
"documentation": [], # List of doc sources
"github": [], # List of github sources
"pdf": [], # List of pdf sources
"word": [], # List of word sources
"local": [], # List of local sources (docs or code)
}
# Track source index for unique naming (multi-source support)
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "local": 0}
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "local": 0}
# Output paths - cleaner organization
self.name = self.config["name"]
@@ -151,6 +152,8 @@ class UnifiedScraper:
self._scrape_github(source)
elif source_type == "pdf":
self._scrape_pdf(source)
elif source_type == "word":
self._scrape_word(source)
elif source_type == "local":
self._scrape_local(source)
else:
@@ -514,6 +517,65 @@ class UnifiedScraper:
logger.info(f"✅ PDF: {len(pdf_data.get('pages', []))} pages extracted")
def _scrape_word(self, source: dict[str, Any]):
    """Scrape a Word document (.docx) source into the unified cache.

    Extracts the document via WordToSkillConverter, copies the extracted
    JSON into the shared data cache, records the result under
    ``self.scraped_data["word"]``, and (best-effort) builds a standalone
    SKILL.md for later synthesis.

    Args:
        source: Source config dict; must contain "path" (the .docx file)
            and may contain "name" (human-readable source name).
    """
    try:
        from skill_seekers.cli.word_scraper import WordToSkillConverter
    except ImportError:
        # Optional docx support not installed / module missing — skip this source.
        logger.error("word_scraper.py not found")
        return

    # Multi-source support: unique index so several .docx sources don't collide
    idx = self._source_counters["word"]
    self._source_counters["word"] += 1

    # Filename without extension doubles as a stable identifier for naming
    docx_path = source["path"]
    docx_id = os.path.splitext(os.path.basename(docx_path))[0]

    # Config for the Word scraper (docx_path used consistently throughout)
    word_config = {
        "name": f"{self.name}_word_{idx}_{docx_id}",
        "docx_path": docx_path,
        "description": f"{source.get('name', docx_id)} documentation",
    }

    logger.info(f"Scraping Word document: {docx_path}")
    converter = WordToSkillConverter(word_config)

    # Extract content, then reload the JSON the converter wrote to disk
    converter.extract_docx()
    with open(converter.data_file, encoding="utf-8") as f:
        word_data = json.load(f)

    # Copy the data file into the unified cache directory
    cache_word_data = os.path.join(self.data_dir, f"word_data_{idx}_{docx_id}.json")
    shutil.copy(converter.data_file, cache_word_data)

    self.scraped_data["word"].append(
        {
            "docx_path": docx_path,
            "docx_id": docx_id,
            "idx": idx,
            "data": word_data,
            "data_file": cache_word_data,
        }
    )

    # Best-effort: a failed standalone SKILL.md must not abort the whole run
    try:
        converter.build_skill()
        logger.info("✅ Word: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone Word SKILL.md: {e}")

    logger.info(f"✅ Word: {len(word_data.get('pages', []))} sections extracted")
def _scrape_local(self, source: dict[str, Any]):
"""
Scrape local directory (documentation files or source code).

File diff suppressed because it is too large Load Diff