feat(B2): add Microsoft Word (.docx) support
Implements ROADMAP task B2 — full .docx scraping support via mammoth + python-docx, producing SKILL.md + references/ output identical to other source types.

New files:
- src/skill_seekers/cli/word_scraper.py — WordToSkillConverter class + main() entry point (~600 lines); mammoth → BeautifulSoup pipeline; handles headings, code detection (incl. monospace <p><br> blocks), tables, images, metadata extraction
- src/skill_seekers/cli/arguments/word.py — add_word_arguments() + WORD_ARGUMENTS dict
- src/skill_seekers/cli/parsers/word_parser.py — WordParser for unified CLI parser registry
- tests/test_word_scraper.py — comprehensive test suite (~300 lines)

Modified files:
- src/skill_seekers/cli/main.py — registered "word" command module
- src/skill_seekers/cli/source_detector.py — .docx auto-detection + _detect_word() classmethod
- src/skill_seekers/cli/create_command.py — _route_word() + --help-word
- src/skill_seekers/cli/arguments/create.py — WORD_ARGUMENTS + routing
- src/skill_seekers/cli/arguments/__init__.py — export word args
- src/skill_seekers/cli/parsers/__init__.py — register WordParser
- src/skill_seekers/cli/unified_scraper.py — _scrape_word() integration
- src/skill_seekers/cli/pdf_scraper.py — fix: real enhancement instead of stub; remove [:3] reference file limit; capture run_workflows return
- src/skill_seekers/cli/github_scraper.py — fix: remove arbitrary open_issues[:20] / closed_issues[:10] reference file limits
- pyproject.toml — skill-seekers-word entry point + docx optional dep
- tests/test_cli_parsers.py — update parser count 21→22

Bug fixes applied during real-world testing:
- Code detection: detect monospace <p><br> blocks as code (mammoth renders Courier paragraphs this way, not as <pre>/<code>)
- Language detector: fix wrong method name detect_from_text → detect_from_code
- Description inference: pass None from main() so extract_docx() can infer description from Word document subject/title metadata
- Bullet-point guard: exclude prose starting with •/-/* from code scoring
- Enhancement: implement real API/LOCAL enhancement (was stub)
- pip install message: add quotes around skill-seekers[docx]

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -131,6 +131,8 @@ class CreateCommand:
|
||||
return self._route_local()
|
||||
elif self.source_info.type == "pdf":
|
||||
return self._route_pdf()
|
||||
elif self.source_info.type == "word":
|
||||
return self._route_word()
|
||||
elif self.source_info.type == "config":
|
||||
return self._route_config()
|
||||
else:
|
||||
@@ -320,6 +322,29 @@ class CreateCommand:
|
||||
finally:
|
||||
sys.argv = original_argv
|
||||
|
||||
def _route_word(self) -> int:
    """Route to Word document scraper (word_scraper.py).

    Builds a command line for ``word_scraper`` from the detected source's
    ``file_path`` plus the arguments appended by ``_add_common_args``,
    temporarily installs it as ``sys.argv``, and runs ``word_scraper.main()``.

    Returns:
        Whatever ``word_scraper.main()`` returns.
    """
    from skill_seekers.cli import word_scraper

    # Program name followed by the DOCX file taken from the parsed source info.
    docx_path = self.source_info.parsed["file_path"]
    scraper_argv = ["word_scraper", "--docx", docx_path]

    # Append the universal arguments (the helper extends the list in place).
    self._add_common_args(scraper_argv)

    logger.debug(f"Calling word_scraper with argv: {scraper_argv}")

    # Swap sys.argv for the duration of the call; always restore it afterwards,
    # even if word_scraper.main() raises.
    saved_argv, sys.argv = sys.argv, scraper_argv
    try:
        return word_scraper.main()
    finally:
        sys.argv = saved_argv
|
||||
|
||||
def _route_config(self) -> int:
|
||||
"""Route to unified scraper for config files (unified_scraper.py)."""
|
||||
from skill_seekers.cli import unified_scraper
|
||||
@@ -442,6 +467,7 @@ Examples:
|
||||
GitHub: skill-seekers create facebook/react -p standard
|
||||
Local: skill-seekers create ./my-project -p comprehensive
|
||||
PDF: skill-seekers create tutorial.pdf --ocr
|
||||
DOCX: skill-seekers create document.docx
|
||||
Config: skill-seekers create configs/react.json
|
||||
|
||||
Source Auto-Detection:
|
||||
@@ -449,6 +475,7 @@ Source Auto-Detection:
|
||||
• owner/repo → GitHub analysis
|
||||
• ./path → local codebase
|
||||
• file.pdf → PDF extraction
|
||||
• file.docx → Word document extraction
|
||||
• file.json → multi-source config
|
||||
|
||||
Progressive Help (13 → 120+ flags):
|
||||
@@ -483,6 +510,9 @@ Common Workflows:
|
||||
"--help-local", action="store_true", help=argparse.SUPPRESS, dest="_help_local"
|
||||
)
|
||||
parser.add_argument("--help-pdf", action="store_true", help=argparse.SUPPRESS, dest="_help_pdf")
|
||||
parser.add_argument(
|
||||
"--help-word", action="store_true", help=argparse.SUPPRESS, dest="_help_word"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--help-config", action="store_true", help=argparse.SUPPRESS, dest="_help_config"
|
||||
)
|
||||
@@ -532,6 +562,15 @@ Common Workflows:
|
||||
add_create_arguments(parser_pdf, mode="pdf")
|
||||
parser_pdf.print_help()
|
||||
return 0
|
||||
elif args._help_word:
|
||||
parser_word = argparse.ArgumentParser(
|
||||
prog="skill-seekers create",
|
||||
description="Create skill from Word document (.docx)",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
add_create_arguments(parser_word, mode="word")
|
||||
parser_word.print_help()
|
||||
return 0
|
||||
elif args._help_config:
|
||||
parser_config = argparse.ArgumentParser(
|
||||
prog="skill-seekers create",
|
||||
|
||||
Reference in New Issue
Block a user