feat(B2): add Microsoft Word (.docx) support
Implements ROADMAP task B2 — full .docx scraping support via mammoth + python-docx, producing SKILL.md + references/ output identical to other source types.

New files:
- src/skill_seekers/cli/word_scraper.py — WordToSkillConverter class + main() entry point (~600 lines); mammoth → BeautifulSoup pipeline; handles headings, code detection (incl. monospace <p><br> blocks), tables, images, metadata extraction
- src/skill_seekers/cli/arguments/word.py — add_word_arguments() + WORD_ARGUMENTS dict
- src/skill_seekers/cli/parsers/word_parser.py — WordParser for unified CLI parser registry
- tests/test_word_scraper.py — comprehensive test suite (~300 lines)

Modified files:
- src/skill_seekers/cli/main.py — registered "word" command module
- src/skill_seekers/cli/source_detector.py — .docx auto-detection + _detect_word() classmethod
- src/skill_seekers/cli/create_command.py — _route_word() + --help-word
- src/skill_seekers/cli/arguments/create.py — WORD_ARGUMENTS + routing
- src/skill_seekers/cli/arguments/__init__.py — export word args
- src/skill_seekers/cli/parsers/__init__.py — register WordParser
- src/skill_seekers/cli/unified_scraper.py — _scrape_word() integration
- src/skill_seekers/cli/pdf_scraper.py — fix: real enhancement instead of stub; remove [:3] reference file limit; capture run_workflows return
- src/skill_seekers/cli/github_scraper.py — fix: remove arbitrary open_issues[:20] / closed_issues[:10] reference file limits
- pyproject.toml — skill-seekers-word entry point + docx optional dep
- tests/test_cli_parsers.py — update parser count 21→22

Bug fixes applied during real-world testing:
- Code detection: detect monospace <p><br> blocks as code (mammoth renders Courier paragraphs this way, not as <pre>/<code>)
- Language detector: fix wrong method name detect_from_text → detect_from_code
- Description inference: pass None from main() so extract_docx() can infer description from Word document subject/title metadata
- Bullet-point guard: exclude prose starting with •/-/* from code scoring
- Enhancement: implement real API/LOCAL enhancement (was stub)
- pip install message: add quotes around skill-seekers[docx]

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -319,7 +319,7 @@ class PDFToSkillConverter:
|
||||
code_list = page.get("code_samples") or page.get("code_blocks")
|
||||
if code_list:
|
||||
f.write("### Code Examples\n\n")
|
||||
for code in code_list[:3]: # Limit to top 3
|
||||
for code in code_list:
|
||||
lang = code.get("language", "")
|
||||
f.write(f"```{lang}\n{code['code']}\n```\n\n")
|
||||
|
||||
@@ -721,21 +721,44 @@ def main():
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Traditional Enhancement (complements workflow system)
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Note: Runs independently of workflow system (they complement each other)
|
||||
if getattr(args, "enhance_level", 0) > 0:
|
||||
# Traditional AI enhancement (API or LOCAL mode)
|
||||
import os
|
||||
|
||||
api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
|
||||
mode = "API" if api_key else "LOCAL"
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("🤖 Traditional AI Enhancement")
|
||||
print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
|
||||
print("=" * 80)
|
||||
if workflow_executed:
|
||||
print(f" Running after workflow: {workflow_name}")
|
||||
print(
|
||||
" (Workflow provides specialized analysis, enhancement provides general improvements)"
|
||||
)
|
||||
print(" (Use --enhance-workflow for more control)")
|
||||
print("")
|
||||
# Note: PDF scraper uses enhance_level instead of enhance/enhance_local
|
||||
# This is consistent with the new unified enhancement system
|
||||
|
||||
skill_dir = converter.skill_dir
|
||||
if api_key:
|
||||
try:
|
||||
from skill_seekers.cli.enhance_skill import enhance_skill_md
|
||||
|
||||
enhance_skill_md(skill_dir, api_key)
|
||||
print("✅ API enhancement complete!")
|
||||
except ImportError:
|
||||
print("❌ API enhancement not available. Falling back to LOCAL mode...")
|
||||
from pathlib import Path
|
||||
from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
|
||||
|
||||
enhancer = LocalSkillEnhancer(Path(skill_dir))
|
||||
enhancer.run(headless=True)
|
||||
print("✅ Local enhancement complete!")
|
||||
else:
|
||||
from pathlib import Path
|
||||
from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
|
||||
|
||||
enhancer = LocalSkillEnhancer(Path(skill_dir))
|
||||
enhancer.run(headless=True)
|
||||
print("✅ Local enhancement complete!")
|
||||
|
||||
except RuntimeError as e:
|
||||
print(f"\n❌ Error: {e}", file=sys.stderr)
|
||||
|
||||
Reference in New Issue
Block a user