feat(B2): add Microsoft Word (.docx) support
Implements ROADMAP task B2 — full .docx scraping support via mammoth + python-docx, producing SKILL.md + references/ output identical to other source types. New files: - src/skill_seekers/cli/word_scraper.py — WordToSkillConverter class + main() entry point (~600 lines); mammoth → BeautifulSoup pipeline; handles headings, code detection (incl. monospace <p><br> blocks), tables, images, metadata extraction - src/skill_seekers/cli/arguments/word.py — add_word_arguments() + WORD_ARGUMENTS dict - src/skill_seekers/cli/parsers/word_parser.py — WordParser for unified CLI parser registry - tests/test_word_scraper.py — comprehensive test suite (~300 lines) Modified files: - src/skill_seekers/cli/main.py — registered "word" command module - src/skill_seekers/cli/source_detector.py — .docx auto-detection + _detect_word() classmethod - src/skill_seekers/cli/create_command.py — _route_word() + --help-word - src/skill_seekers/cli/arguments/create.py — WORD_ARGUMENTS + routing - src/skill_seekers/cli/arguments/__init__.py — export word args - src/skill_seekers/cli/parsers/__init__.py — register WordParser - src/skill_seekers/cli/unified_scraper.py — _scrape_word() integration - src/skill_seekers/cli/pdf_scraper.py — fix: real enhancement instead of stub; remove [:3] reference file limit; capture run_workflows return - src/skill_seekers/cli/github_scraper.py — fix: remove arbitrary open_issues[:20] / closed_issues[:10] reference file limits - pyproject.toml — skill-seekers-word entry point + docx optional dep - tests/test_cli_parsers.py — update parser count 21→22 Bug fixes applied during real-world testing: - Code detection: detect monospace <p><br> blocks as code (mammoth renders Courier paragraphs this way, not as <pre>/<code>) - Language detector: fix wrong method name detect_from_text → detect_from_code - Description inference: pass None from main() so extract_docx() can infer description from Word document subject/title metadata - Bullet-point guard: exclude prose starting with •/-/* from code scoring - Enhancement: implement real API/LOCAL enhancement (was stub) - pip install message: add quotes around skill-seekers[docx] Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -389,6 +389,18 @@ PDF_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
},
|
||||
}
|
||||
|
||||
# Word document specific (from word.py)
|
||||
WORD_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"docx": {
|
||||
"flags": ("--docx",),
|
||||
"kwargs": {
|
||||
"type": str,
|
||||
"help": "DOCX file path",
|
||||
"metavar": "PATH",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Multi-source config specific (from unified_scraper.py)
|
||||
CONFIG_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"merge_mode": {
|
||||
@@ -471,6 +483,7 @@ def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]]
|
||||
"github": GITHUB_ARGUMENTS,
|
||||
"local": LOCAL_ARGUMENTS,
|
||||
"pdf": PDF_ARGUMENTS,
|
||||
"word": WORD_ARGUMENTS,
|
||||
"config": CONFIG_ARGUMENTS,
|
||||
}
|
||||
return source_args.get(source_type, {})
|
||||
@@ -507,12 +520,13 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
|
||||
- 'github': Universal + github-specific
|
||||
- 'local': Universal + local-specific
|
||||
- 'pdf': Universal + pdf-specific
|
||||
- 'word': Universal + word-specific
|
||||
- 'advanced': Advanced/rare arguments
|
||||
- 'all': All 120+ arguments
|
||||
|
||||
Args:
|
||||
parser: ArgumentParser to add arguments to
|
||||
mode: Help mode (default, web, github, local, pdf, advanced, all)
|
||||
mode: Help mode (default, web, github, local, pdf, word, advanced, all)
|
||||
"""
|
||||
# Positional argument for source
|
||||
parser.add_argument(
|
||||
@@ -543,6 +557,10 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
|
||||
for arg_name, arg_def in PDF_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
if mode in ["word", "all"]:
|
||||
for arg_name, arg_def in WORD_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
if mode in ["config", "all"]:
|
||||
for arg_name, arg_def in CONFIG_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
Reference in New Issue
Block a user