feat(B2): add Microsoft Word (.docx) support

Implements ROADMAP task B2 — full .docx scraping support via mammoth + python-docx, producing SKILL.md + references/ output identical to other source types.

New files:
- src/skill_seekers/cli/word_scraper.py — WordToSkillConverter class + main() entry point (~600 lines); mammoth → BeautifulSoup pipeline; handles headings, code detection (incl. monospace <p><br> blocks), tables, images, metadata extraction
- src/skill_seekers/cli/arguments/word.py — add_word_arguments() + WORD_ARGUMENTS dict
- src/skill_seekers/cli/parsers/word_parser.py — WordParser for unified CLI parser registry
- tests/test_word_scraper.py — comprehensive test suite (~300 lines)

Modified files:
- src/skill_seekers/cli/main.py — registered "word" command module
- src/skill_seekers/cli/source_detector.py — .docx auto-detection + _detect_word() classmethod
- src/skill_seekers/cli/create_command.py — _route_word() + --help-word
- src/skill_seekers/cli/arguments/create.py — WORD_ARGUMENTS + routing
- src/skill_seekers/cli/arguments/__init__.py — export word args
- src/skill_seekers/cli/parsers/__init__.py — register WordParser
- src/skill_seekers/cli/unified_scraper.py — _scrape_word() integration
- src/skill_seekers/cli/pdf_scraper.py — fix: real enhancement instead of stub; remove [:3] reference file limit; capture run_workflows return
- src/skill_seekers/cli/github_scraper.py — fix: remove arbitrary open_issues[:20] / closed_issues[:10] reference file limits
- pyproject.toml — skill-seekers-word entry point + docx optional dep
- tests/test_cli_parsers.py — update parser count 21→22

Bug fixes applied during real-world testing:
- Code detection: detect monospace <p><br> blocks as code (mammoth renders Courier paragraphs this way, not as <pre>/<code>)
- Language detector: fix wrong method name detect_from_text → detect_from_code
- Description inference: pass None from main() so extract_docx() can infer description from Word document subject/title metadata
- Bullet-point guard: exclude prose starting with •/-/* from code scoring
- Enhancement: implement real API/LOCAL enhancement (was stub)
- pip install message: add quotes around skill-seekers[docx]

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-25 21:47:30 +03:00
parent e42aade992
commit b81d55fda0
17 changed files with 2214 additions and 67 deletions
--- a/src/skill_seekers/cli/source_detector.py
+++ b/src/skill_seekers/cli/source_detector.py
@@ -60,6 +60,9 @@ class SourceDetector:
        if source.endswith(".pdf"):
            return cls._detect_pdf(source)

+        if source.endswith(".docx"):
+            return cls._detect_word(source)
+
        # 2. Directory detection
        if os.path.isdir(source):
            return cls._detect_local(source)
@@ -85,6 +88,7 @@ class SourceDetector:
            "  GitHub: skill-seekers create facebook/react\n"
            "  Local:  skill-seekers create ./my-project\n"
            "  PDF:    skill-seekers create tutorial.pdf\n"
+            "  DOCX:   skill-seekers create document.docx\n"
            "  Config: skill-seekers create configs/react.json"
        )

@@ -104,6 +108,14 @@ class SourceDetector:
            type="pdf", parsed={"file_path": source}, suggested_name=name, raw_input=source
        )

+    @classmethod
+    def _detect_word(cls, source: str) -> SourceInfo:
+        """Return a "word" SourceInfo for a .docx path, named after the file's stem."""
+        name = os.path.splitext(os.path.basename(source))[0]  # basename minus extension
+        return SourceInfo(
+            type="word", parsed={"file_path": source}, suggested_name=name, raw_input=source
+        )
+
    @classmethod
    def _detect_local(cls, source: str) -> SourceInfo:
        """Detect local directory source."""
@@ -190,6 +202,13 @@ class SourceDetector:
            if not os.path.isfile(file_path):
                raise ValueError(f"Path is not a file: {file_path}")

+        elif source_info.type == "word":
+            file_path = source_info.parsed["file_path"]
+            if not os.path.exists(file_path):
+                raise ValueError(f"Word document does not exist: {file_path}")
+            if not os.path.isfile(file_path):
+                raise ValueError(f"Path is not a file: {file_path}")
+
        elif source_info.type == "config":
            config_path = source_info.parsed["config_path"]
            if not os.path.exists(config_path):