feat(B2): add Microsoft Word (.docx) support

Implements ROADMAP task B2 — full .docx scraping support via mammoth +
python-docx, producing SKILL.md + references/ output identical to other
source types.

New files:
- src/skill_seekers/cli/word_scraper.py — WordToSkillConverter class +
  main() entry point (~600 lines); mammoth → BeautifulSoup pipeline;
  handles headings, code detection (incl. monospace <p><br> blocks),
  tables, images, metadata extraction
- src/skill_seekers/cli/arguments/word.py — add_word_arguments() +
  WORD_ARGUMENTS dict
- src/skill_seekers/cli/parsers/word_parser.py — WordParser for unified
  CLI parser registry
- tests/test_word_scraper.py — comprehensive test suite (~300 lines)

Modified files:
- src/skill_seekers/cli/main.py — registered "word" command module
- src/skill_seekers/cli/source_detector.py — .docx auto-detection +
  _detect_word() classmethod
- src/skill_seekers/cli/create_command.py — _route_word() + --help-word
- src/skill_seekers/cli/arguments/create.py — WORD_ARGUMENTS + routing
- src/skill_seekers/cli/arguments/__init__.py — export word args
- src/skill_seekers/cli/parsers/__init__.py — register WordParser
- src/skill_seekers/cli/unified_scraper.py — _scrape_word() integration
- src/skill_seekers/cli/pdf_scraper.py — fix: real enhancement instead
  of stub; remove [:3] reference file limit; capture run_workflows return
- src/skill_seekers/cli/github_scraper.py — fix: remove arbitrary
  open_issues[:20] / closed_issues[:10] reference file limits
- pyproject.toml — skill-seekers-word entry point + docx optional dep
- tests/test_cli_parsers.py — update parser count 21→22

Bug fixes applied during real-world testing:
- Code detection: detect monospace <p><br> blocks as code (mammoth
  renders Courier paragraphs this way, not as <pre>/<code>)
- Language detector: fix wrong method name detect_from_text →
  detect_from_code
- Description inference: pass None from main() so extract_docx() can
  infer description from Word document subject/title metadata
- Bullet-point guard: exclude prose starting with •/-/* from code scoring
- Enhancement: implement real API/LOCAL enhancement (was stub)
- pip install message: add quotes around skill-seekers[docx]

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-25 21:47:30 +03:00
parent e42aade992
commit b81d55fda0
17 changed files with 2214 additions and 67 deletions

View File

@@ -21,6 +21,7 @@ from .common import add_common_arguments, COMMON_ARGUMENTS
from .scrape import add_scrape_arguments, SCRAPE_ARGUMENTS
from .github import add_github_arguments, GITHUB_ARGUMENTS
from .pdf import add_pdf_arguments, PDF_ARGUMENTS
from .word import add_word_arguments, WORD_ARGUMENTS
from .analyze import add_analyze_arguments, ANALYZE_ARGUMENTS
from .unified import add_unified_arguments, UNIFIED_ARGUMENTS
from .package import add_package_arguments, PACKAGE_ARGUMENTS
@@ -38,11 +39,13 @@ __all__ = [
"add_package_arguments",
"add_upload_arguments",
"add_enhance_arguments",
"add_word_arguments",
# Data
"COMMON_ARGUMENTS",
"SCRAPE_ARGUMENTS",
"GITHUB_ARGUMENTS",
"PDF_ARGUMENTS",
"WORD_ARGUMENTS",
"ANALYZE_ARGUMENTS",
"UNIFIED_ARGUMENTS",
"PACKAGE_ARGUMENTS",

View File

@@ -389,6 +389,18 @@ PDF_ARGUMENTS: dict[str, dict[str, Any]] = {
},
}
# Word document specific (from word.py)
# Word-specific arguments exposed through `create` (subset of arguments/word.py;
# --from-json is standalone-scraper-only and is intentionally absent here).
WORD_ARGUMENTS: dict[str, dict[str, Any]] = {
    "docx": {
        # CLI flag(s) registered for this argument
        "flags": ("--docx",),
        # Keyword arguments forwarded verbatim to parser.add_argument()
        "kwargs": {
            "type": str,
            "help": "DOCX file path",
            "metavar": "PATH",
        },
    },
}
# Multi-source config specific (from unified_scraper.py)
CONFIG_ARGUMENTS: dict[str, dict[str, Any]] = {
"merge_mode": {
@@ -471,6 +483,7 @@ def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]]
"github": GITHUB_ARGUMENTS,
"local": LOCAL_ARGUMENTS,
"pdf": PDF_ARGUMENTS,
"word": WORD_ARGUMENTS,
"config": CONFIG_ARGUMENTS,
}
return source_args.get(source_type, {})
@@ -507,12 +520,13 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
- 'github': Universal + github-specific
- 'local': Universal + local-specific
- 'pdf': Universal + pdf-specific
- 'word': Universal + word-specific
- 'advanced': Advanced/rare arguments
- 'all': All 120+ arguments
Args:
parser: ArgumentParser to add arguments to
mode: Help mode (default, web, github, local, pdf, advanced, all)
mode: Help mode (default, web, github, local, pdf, word, advanced, all)
"""
# Positional argument for source
parser.add_argument(
@@ -543,6 +557,10 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
for arg_name, arg_def in PDF_ARGUMENTS.items():
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
if mode in ["word", "all"]:
for arg_name, arg_def in WORD_ARGUMENTS.items():
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
if mode in ["config", "all"]:
for arg_name, arg_def in CONFIG_ARGUMENTS.items():
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -0,0 +1,66 @@
"""Word document command argument definitions.
This module defines ALL arguments for the word command in ONE place.
Both word_scraper.py (standalone) and parsers/word_parser.py (unified CLI)
import and use these definitions.
Shared arguments (name, description, output, enhance-level, api-key,
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
via ``add_all_standard_arguments()``.
"""
import argparse
from typing import Any
from .common import add_all_standard_arguments
# Word-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
WORD_ARGUMENTS: dict[str, dict[str, Any]] = {
    # --docx PATH: direct path to the source .docx file
    "docx": {
        "flags": ("--docx",),
        "kwargs": dict(type=str, help="Direct DOCX file path", metavar="PATH"),
    },
    # --from-json FILE: skip extraction and build from a previously dumped JSON
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": dict(type=str, help="Build skill from extracted JSON", metavar="FILE"),
    },
}
def add_word_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all word command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds Word-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for Word.

    Args:
        parser: ArgumentParser to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for Word.
    # NOTE: ``parser._actions`` is a private argparse attribute, but there is no
    # public API to change both an already-registered action's default AND its
    # help text (set_defaults() would only cover the default).
    for action in parser._actions:
        if action.dest == "enhance_level":  # every argparse Action has .dest
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for Word), 1=SKILL.md only, 2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, otherwise LOCAL (Claude Code)"
            )
            break  # dest values are unique per parser; no need to keep scanning

    # Word-specific args (keys of WORD_ARGUMENTS are documentation-only here)
    for arg_def in WORD_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -131,6 +131,8 @@ class CreateCommand:
return self._route_local()
elif self.source_info.type == "pdf":
return self._route_pdf()
elif self.source_info.type == "word":
return self._route_word()
elif self.source_info.type == "config":
return self._route_config()
else:
@@ -320,6 +322,29 @@ class CreateCommand:
finally:
sys.argv = original_argv
def _route_word(self) -> int:
    """Route to Word document scraper (word_scraper.py)."""
    from skill_seekers.cli import word_scraper

    # Rebuild argv the way word_scraper's own CLI expects it:
    # program name, the --docx path, then the shared universal flags.
    docx_path = self.source_info.parsed["file_path"]
    scraper_argv = ["word_scraper", "--docx", docx_path]
    self._add_common_args(scraper_argv)

    logger.debug(f"Calling word_scraper with argv: {scraper_argv}")

    # Temporarily swap sys.argv so word_scraper.main() parses our args,
    # restoring the original no matter how main() exits.
    saved_argv = sys.argv
    sys.argv = scraper_argv
    try:
        return word_scraper.main()
    finally:
        sys.argv = saved_argv
def _route_config(self) -> int:
"""Route to unified scraper for config files (unified_scraper.py)."""
from skill_seekers.cli import unified_scraper
@@ -442,6 +467,7 @@ Examples:
GitHub: skill-seekers create facebook/react -p standard
Local: skill-seekers create ./my-project -p comprehensive
PDF: skill-seekers create tutorial.pdf --ocr
DOCX: skill-seekers create document.docx
Config: skill-seekers create configs/react.json
Source Auto-Detection:
@@ -449,6 +475,7 @@ Source Auto-Detection:
• owner/repo → GitHub analysis
• ./path → local codebase
• file.pdf → PDF extraction
• file.docx → Word document extraction
• file.json → multi-source config
Progressive Help (13 → 120+ flags):
@@ -483,6 +510,9 @@ Common Workflows:
"--help-local", action="store_true", help=argparse.SUPPRESS, dest="_help_local"
)
parser.add_argument("--help-pdf", action="store_true", help=argparse.SUPPRESS, dest="_help_pdf")
parser.add_argument(
"--help-word", action="store_true", help=argparse.SUPPRESS, dest="_help_word"
)
parser.add_argument(
"--help-config", action="store_true", help=argparse.SUPPRESS, dest="_help_config"
)
@@ -532,6 +562,15 @@ Common Workflows:
add_create_arguments(parser_pdf, mode="pdf")
parser_pdf.print_help()
return 0
elif args._help_word:
parser_word = argparse.ArgumentParser(
prog="skill-seekers create",
description="Create skill from Word document (.docx)",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
add_create_arguments(parser_word, mode="word")
parser_word.print_help()
return 0
elif args._help_config:
parser_config = argparse.ArgumentParser(
prog="skill-seekers create",

View File

@@ -1296,14 +1296,14 @@ Use this skill when you need to:
closed_issues = [i for i in issues if i["state"] == "closed"]
content += f"## Open Issues ({len(open_issues)})\n\n"
for issue in open_issues[:20]:
for issue in open_issues:
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
content += f"### #{issue['number']}: {issue['title']}\n"
content += f"**Labels:** {labels} | **Created:** {issue['created_at'][:10]}\n"
content += f"[View on GitHub]({issue['url']})\n\n"
content += f"\n## Recently Closed Issues ({len(closed_issues)})\n\n"
for issue in closed_issues[:10]:
for issue in closed_issues:
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
content += f"### #{issue['number']}: {issue['title']}\n"
content += f"**Labels:** {labels} | **Closed:** {issue['closed_at'][:10]}\n"

View File

@@ -47,6 +47,7 @@ COMMAND_MODULES = {
"scrape": "skill_seekers.cli.doc_scraper",
"github": "skill_seekers.cli.github_scraper",
"pdf": "skill_seekers.cli.pdf_scraper",
"word": "skill_seekers.cli.word_scraper",
"unified": "skill_seekers.cli.unified_scraper",
"enhance": "skill_seekers.cli.enhance_command",
"enhance-status": "skill_seekers.cli.enhance_status",

View File

@@ -12,6 +12,7 @@ from .config_parser import ConfigParser
from .scrape_parser import ScrapeParser
from .github_parser import GitHubParser
from .pdf_parser import PDFParser
from .word_parser import WordParser
from .unified_parser import UnifiedParser
from .enhance_parser import EnhanceParser
from .enhance_status_parser import EnhanceStatusParser
@@ -41,6 +42,7 @@ PARSERS = [
EnhanceParser(),
EnhanceStatusParser(),
PDFParser(),
WordParser(),
UnifiedParser(),
EstimateParser(),
InstallParser(),

View File

@@ -0,0 +1,32 @@
"""Word document subcommand parser.
Uses shared argument definitions from arguments.word to ensure
consistency with the standalone word_scraper module.
"""
from .base import SubcommandParser
from skill_seekers.cli.arguments.word import add_word_arguments
class WordParser(SubcommandParser):
    """Unified-CLI parser for the ``word`` subcommand."""

    @property
    def name(self) -> str:
        """Subcommand name as typed on the command line."""
        return "word"

    @property
    def help(self) -> str:
        """One-line help shown in the top-level command list."""
        return "Extract from Word document (.docx)"

    @property
    def description(self) -> str:
        """Longer description shown in this subcommand's own --help."""
        return "Extract content from Word document (.docx) and generate skill"

    def add_arguments(self, parser) -> None:
        """Register word arguments from the shared definitions in arguments.word,
        keeping the unified CLI in sync with the standalone word_scraper."""
        add_word_arguments(parser)

View File

@@ -319,7 +319,7 @@ class PDFToSkillConverter:
code_list = page.get("code_samples") or page.get("code_blocks")
if code_list:
f.write("### Code Examples\n\n")
for code in code_list[:3]: # Limit to top 3
for code in code_list:
lang = code.get("language", "")
f.write(f"```{lang}\n{code['code']}\n```\n\n")
@@ -721,21 +721,44 @@ def main():
# ═══════════════════════════════════════════════════════════════════════════
# Traditional Enhancement (complements workflow system)
# ═══════════════════════════════════════════════════════════════════════════
# Note: Runs independently of workflow system (they complement each other)
if getattr(args, "enhance_level", 0) > 0:
# Traditional AI enhancement (API or LOCAL mode)
import os
api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
mode = "API" if api_key else "LOCAL"
print("\n" + "=" * 80)
print("🤖 Traditional AI Enhancement")
print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
print("=" * 80)
if workflow_executed:
print(f" Running after workflow: {workflow_name}")
print(
" (Workflow provides specialized analysis, enhancement provides general improvements)"
)
print(" (Use --enhance-workflow for more control)")
print("")
# Note: PDF scraper uses enhance_level instead of enhance/enhance_local
# This is consistent with the new unified enhancement system
skill_dir = converter.skill_dir
if api_key:
try:
from skill_seekers.cli.enhance_skill import enhance_skill_md
enhance_skill_md(skill_dir, api_key)
print("✅ API enhancement complete!")
except ImportError:
print("❌ API enhancement not available. Falling back to LOCAL mode...")
from pathlib import Path
from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
enhancer = LocalSkillEnhancer(Path(skill_dir))
enhancer.run(headless=True)
print("✅ Local enhancement complete!")
else:
from pathlib import Path
from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
enhancer = LocalSkillEnhancer(Path(skill_dir))
enhancer.run(headless=True)
print("✅ Local enhancement complete!")
except RuntimeError as e:
print(f"\n❌ Error: {e}", file=sys.stderr)

View File

@@ -60,6 +60,9 @@ class SourceDetector:
if source.endswith(".pdf"):
return cls._detect_pdf(source)
if source.endswith(".docx"):
return cls._detect_word(source)
# 2. Directory detection
if os.path.isdir(source):
return cls._detect_local(source)
@@ -85,6 +88,7 @@ class SourceDetector:
" GitHub: skill-seekers create facebook/react\n"
" Local: skill-seekers create ./my-project\n"
" PDF: skill-seekers create tutorial.pdf\n"
" DOCX: skill-seekers create document.docx\n"
" Config: skill-seekers create configs/react.json"
)
@@ -104,6 +108,14 @@ class SourceDetector:
type="pdf", parsed={"file_path": source}, suggested_name=name, raw_input=source
)
@classmethod
def _detect_word(cls, source: str) -> SourceInfo:
    """Detect Word document (.docx) source."""
    # The filename stem (basename minus extension) becomes the suggested skill name.
    stem, _ext = os.path.splitext(os.path.basename(source))
    return SourceInfo(
        type="word",
        parsed={"file_path": source},
        suggested_name=stem,
        raw_input=source,
    )
@classmethod
def _detect_local(cls, source: str) -> SourceInfo:
"""Detect local directory source."""
@@ -190,6 +202,13 @@ class SourceDetector:
if not os.path.isfile(file_path):
raise ValueError(f"Path is not a file: {file_path}")
elif source_info.type == "word":
file_path = source_info.parsed["file_path"]
if not os.path.exists(file_path):
raise ValueError(f"Word document does not exist: {file_path}")
if not os.path.isfile(file_path):
raise ValueError(f"Path is not a file: {file_path}")
elif source_info.type == "config":
config_path = source_info.parsed["config_path"]
if not os.path.exists(config_path):

View File

@@ -73,11 +73,12 @@ class UnifiedScraper:
"documentation": [], # List of doc sources
"github": [], # List of github sources
"pdf": [], # List of pdf sources
"word": [], # List of word sources
"local": [], # List of local sources (docs or code)
}
# Track source index for unique naming (multi-source support)
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "local": 0}
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "local": 0}
# Output paths - cleaner organization
self.name = self.config["name"]
@@ -151,6 +152,8 @@ class UnifiedScraper:
self._scrape_github(source)
elif source_type == "pdf":
self._scrape_pdf(source)
elif source_type == "word":
self._scrape_word(source)
elif source_type == "local":
self._scrape_local(source)
else:
@@ -514,6 +517,65 @@ class UnifiedScraper:
logger.info(f"✅ PDF: {len(pdf_data.get('pages', []))} pages extracted")
def _scrape_word(self, source: dict[str, Any]):
    """Scrape a Word document (.docx) source into the unified cache.

    Extracts the document via WordToSkillConverter, copies the extracted
    JSON into the shared data cache, records the result under
    ``self.scraped_data["word"]``, and (best-effort) builds a standalone
    SKILL.md for later synthesis.

    Args:
        source: Source config dict; must contain "path" (the .docx file)
            and may contain "name" (human-readable source name).
    """
    try:
        from skill_seekers.cli.word_scraper import WordToSkillConverter
    except ImportError:
        # Optional docx support not installed / module missing — skip this source.
        logger.error("word_scraper.py not found")
        return

    # Multi-source support: unique index so several .docx sources don't collide
    idx = self._source_counters["word"]
    self._source_counters["word"] += 1

    # Filename without extension doubles as a stable identifier for naming
    docx_path = source["path"]
    docx_id = os.path.splitext(os.path.basename(docx_path))[0]

    # Config for the Word scraper (docx_path used consistently throughout)
    word_config = {
        "name": f"{self.name}_word_{idx}_{docx_id}",
        "docx_path": docx_path,
        "description": f"{source.get('name', docx_id)} documentation",
    }

    logger.info(f"Scraping Word document: {docx_path}")
    converter = WordToSkillConverter(word_config)

    # Extract content, then reload the JSON the converter wrote to disk
    converter.extract_docx()
    with open(converter.data_file, encoding="utf-8") as f:
        word_data = json.load(f)

    # Copy the data file into the unified cache directory
    cache_word_data = os.path.join(self.data_dir, f"word_data_{idx}_{docx_id}.json")
    shutil.copy(converter.data_file, cache_word_data)

    self.scraped_data["word"].append(
        {
            "docx_path": docx_path,
            "docx_id": docx_id,
            "idx": idx,
            "data": word_data,
            "data_file": cache_word_data,
        }
    )

    # Best-effort: a failed standalone SKILL.md must not abort the whole run
    try:
        converter.build_skill()
        logger.info("✅ Word: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone Word SKILL.md: {e}")

    logger.info(f"✅ Word: {len(word_data.get('pages', []))} sections extracted")
def _scrape_local(self, source: dict[str, Any]):
"""
Scrape local directory (documentation files or source code).

File diff suppressed because it is too large Load Diff