feat(B2): add Microsoft Word (.docx) support
Implements ROADMAP task B2 — full .docx scraping support via mammoth + python-docx, producing SKILL.md + references/ output identical to other source types. New files: - src/skill_seekers/cli/word_scraper.py — WordToSkillConverter class + main() entry point (~600 lines); mammoth → BeautifulSoup pipeline; handles headings, code detection (incl. monospace <p><br> blocks), tables, images, metadata extraction - src/skill_seekers/cli/arguments/word.py — add_word_arguments() + WORD_ARGUMENTS dict - src/skill_seekers/cli/parsers/word_parser.py — WordParser for unified CLI parser registry - tests/test_word_scraper.py — comprehensive test suite (~300 lines) Modified files: - src/skill_seekers/cli/main.py — registered "word" command module - src/skill_seekers/cli/source_detector.py — .docx auto-detection + _detect_word() classmethod - src/skill_seekers/cli/create_command.py — _route_word() + --help-word - src/skill_seekers/cli/arguments/create.py — WORD_ARGUMENTS + routing - src/skill_seekers/cli/arguments/__init__.py — export word args - src/skill_seekers/cli/parsers/__init__.py — register WordParser - src/skill_seekers/cli/unified_scraper.py — _scrape_word() integration - src/skill_seekers/cli/pdf_scraper.py — fix: real enhancement instead of stub; remove [:3] reference file limit; capture run_workflows return - src/skill_seekers/cli/github_scraper.py — fix: remove arbitrary open_issues[:20] / closed_issues[:10] reference file limits - pyproject.toml — skill-seekers-word entry point + docx optional dep - tests/test_cli_parsers.py — update parser count 21→22 Bug fixes applied during real-world testing: - Code detection: detect monospace <p><br> blocks as code (mammoth renders Courier paragraphs this way, not as <pre>/<code>) - Language detector: fix wrong method name detect_from_text → detect_from_code - Description inference: pass None from main() so extract_docx() can infer description from Word document subject/title metadata - Bullet-point guard: exclude prose starting with •/-/* from code scoring - Enhancement: implement real API/LOCAL enhancement (was stub) - pip install message: add quotes around skill-seekers[docx] Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,7 @@ from .common import add_common_arguments, COMMON_ARGUMENTS
|
||||
from .scrape import add_scrape_arguments, SCRAPE_ARGUMENTS
|
||||
from .github import add_github_arguments, GITHUB_ARGUMENTS
|
||||
from .pdf import add_pdf_arguments, PDF_ARGUMENTS
|
||||
from .word import add_word_arguments, WORD_ARGUMENTS
|
||||
from .analyze import add_analyze_arguments, ANALYZE_ARGUMENTS
|
||||
from .unified import add_unified_arguments, UNIFIED_ARGUMENTS
|
||||
from .package import add_package_arguments, PACKAGE_ARGUMENTS
|
||||
@@ -38,11 +39,13 @@ __all__ = [
|
||||
"add_package_arguments",
|
||||
"add_upload_arguments",
|
||||
"add_enhance_arguments",
|
||||
"add_word_arguments",
|
||||
# Data
|
||||
"COMMON_ARGUMENTS",
|
||||
"SCRAPE_ARGUMENTS",
|
||||
"GITHUB_ARGUMENTS",
|
||||
"PDF_ARGUMENTS",
|
||||
"WORD_ARGUMENTS",
|
||||
"ANALYZE_ARGUMENTS",
|
||||
"UNIFIED_ARGUMENTS",
|
||||
"PACKAGE_ARGUMENTS",
|
||||
|
||||
@@ -389,6 +389,18 @@ PDF_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
},
|
||||
}
|
||||
|
||||
# Word document specific (from word.py)
# NOTE(review): this mirrors arguments/word.py's WORD_ARGUMENTS but omits
# --from-json and uses a slightly different help string — presumably only
# --docx is meant to be surfaced through `create`; confirm the divergence
# is intentional rather than a missed sync.
WORD_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Path to the .docx file to extract; "flags"/"kwargs" feed parser.add_argument()
    "docx": {
        "flags": ("--docx",),
        "kwargs": {
            "type": str,
            "help": "DOCX file path",
            "metavar": "PATH",
        },
    },
}
|
||||
|
||||
# Multi-source config specific (from unified_scraper.py)
|
||||
CONFIG_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"merge_mode": {
|
||||
@@ -471,6 +483,7 @@ def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]]
|
||||
"github": GITHUB_ARGUMENTS,
|
||||
"local": LOCAL_ARGUMENTS,
|
||||
"pdf": PDF_ARGUMENTS,
|
||||
"word": WORD_ARGUMENTS,
|
||||
"config": CONFIG_ARGUMENTS,
|
||||
}
|
||||
return source_args.get(source_type, {})
|
||||
@@ -507,12 +520,13 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
|
||||
- 'github': Universal + github-specific
|
||||
- 'local': Universal + local-specific
|
||||
- 'pdf': Universal + pdf-specific
|
||||
- 'word': Universal + word-specific
|
||||
- 'advanced': Advanced/rare arguments
|
||||
- 'all': All 120+ arguments
|
||||
|
||||
Args:
|
||||
parser: ArgumentParser to add arguments to
|
||||
mode: Help mode (default, web, github, local, pdf, advanced, all)
|
||||
mode: Help mode (default, web, github, local, pdf, word, advanced, all)
|
||||
"""
|
||||
# Positional argument for source
|
||||
parser.add_argument(
|
||||
@@ -543,6 +557,10 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
|
||||
for arg_name, arg_def in PDF_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
if mode in ["word", "all"]:
|
||||
for arg_name, arg_def in WORD_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
if mode in ["config", "all"]:
|
||||
for arg_name, arg_def in CONFIG_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
66
src/skill_seekers/cli/arguments/word.py
Normal file
66
src/skill_seekers/cli/arguments/word.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Word document command argument definitions.
|
||||
|
||||
This module defines ALL arguments for the word command in ONE place.
|
||||
Both word_scraper.py (standalone) and parsers/word_parser.py (unified CLI)
|
||||
import and use these definitions.
|
||||
|
||||
Shared arguments (name, description, output, enhance-level, api-key,
|
||||
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
|
||||
via ``add_all_standard_arguments()``.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import add_all_standard_arguments
|
||||
|
||||
# Word-specific argument definitions as data structure.
# Each entry maps an argument name to the "flags"/"kwargs" pair that is
# splatted into parser.add_argument(*flags, **kwargs).
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
WORD_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Path to the .docx file to extract
    "docx": {
        "flags": ("--docx",),
        "kwargs": {
            "type": str,
            "help": "Direct DOCX file path",
            "metavar": "PATH",
        },
    },
    # Skip extraction and build the skill from a previously extracted JSON dump
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}
|
||||
|
||||
|
||||
def add_word_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all word command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds Word-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for Word.

    Args:
        parser: ArgumentParser to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for Word.
    # NOTE: parser._actions is a private argparse attribute, but it is the only
    # way to retro-fit both the default AND the help text of an already-added
    # argument (parser.set_defaults() would change only the default value).
    for action in parser._actions:
        if action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for Word), 1=SKILL.md only, 2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, otherwise LOCAL (Claude Code)"
            )
            break  # dest values are unique per parser; stop after the first match

    # Word-specific args (only the definitions are needed, not their names)
    for arg_def in WORD_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
||||
@@ -131,6 +131,8 @@ class CreateCommand:
|
||||
return self._route_local()
|
||||
elif self.source_info.type == "pdf":
|
||||
return self._route_pdf()
|
||||
elif self.source_info.type == "word":
|
||||
return self._route_word()
|
||||
elif self.source_info.type == "config":
|
||||
return self._route_config()
|
||||
else:
|
||||
@@ -320,6 +322,29 @@ class CreateCommand:
|
||||
finally:
|
||||
sys.argv = original_argv
|
||||
|
||||
def _route_word(self) -> int:
|
||||
"""Route to Word document scraper (word_scraper.py)."""
|
||||
from skill_seekers.cli import word_scraper
|
||||
|
||||
# Reconstruct argv for word_scraper
|
||||
argv = ["word_scraper"]
|
||||
|
||||
# Add DOCX file
|
||||
file_path = self.source_info.parsed["file_path"]
|
||||
argv.extend(["--docx", file_path])
|
||||
|
||||
# Add universal arguments
|
||||
self._add_common_args(argv)
|
||||
|
||||
# Call word_scraper with modified argv
|
||||
logger.debug(f"Calling word_scraper with argv: {argv}")
|
||||
original_argv = sys.argv
|
||||
try:
|
||||
sys.argv = argv
|
||||
return word_scraper.main()
|
||||
finally:
|
||||
sys.argv = original_argv
|
||||
|
||||
def _route_config(self) -> int:
|
||||
"""Route to unified scraper for config files (unified_scraper.py)."""
|
||||
from skill_seekers.cli import unified_scraper
|
||||
@@ -442,6 +467,7 @@ Examples:
|
||||
GitHub: skill-seekers create facebook/react -p standard
|
||||
Local: skill-seekers create ./my-project -p comprehensive
|
||||
PDF: skill-seekers create tutorial.pdf --ocr
|
||||
DOCX: skill-seekers create document.docx
|
||||
Config: skill-seekers create configs/react.json
|
||||
|
||||
Source Auto-Detection:
|
||||
@@ -449,6 +475,7 @@ Source Auto-Detection:
|
||||
• owner/repo → GitHub analysis
|
||||
• ./path → local codebase
|
||||
• file.pdf → PDF extraction
|
||||
• file.docx → Word document extraction
|
||||
• file.json → multi-source config
|
||||
|
||||
Progressive Help (13 → 120+ flags):
|
||||
@@ -483,6 +510,9 @@ Common Workflows:
|
||||
"--help-local", action="store_true", help=argparse.SUPPRESS, dest="_help_local"
|
||||
)
|
||||
parser.add_argument("--help-pdf", action="store_true", help=argparse.SUPPRESS, dest="_help_pdf")
|
||||
parser.add_argument(
|
||||
"--help-word", action="store_true", help=argparse.SUPPRESS, dest="_help_word"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--help-config", action="store_true", help=argparse.SUPPRESS, dest="_help_config"
|
||||
)
|
||||
@@ -532,6 +562,15 @@ Common Workflows:
|
||||
add_create_arguments(parser_pdf, mode="pdf")
|
||||
parser_pdf.print_help()
|
||||
return 0
|
||||
elif args._help_word:
|
||||
parser_word = argparse.ArgumentParser(
|
||||
prog="skill-seekers create",
|
||||
description="Create skill from Word document (.docx)",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
add_create_arguments(parser_word, mode="word")
|
||||
parser_word.print_help()
|
||||
return 0
|
||||
elif args._help_config:
|
||||
parser_config = argparse.ArgumentParser(
|
||||
prog="skill-seekers create",
|
||||
|
||||
@@ -1296,14 +1296,14 @@ Use this skill when you need to:
|
||||
closed_issues = [i for i in issues if i["state"] == "closed"]
|
||||
|
||||
content += f"## Open Issues ({len(open_issues)})\n\n"
|
||||
for issue in open_issues[:20]:
|
||||
for issue in open_issues:
|
||||
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
|
||||
content += f"### #{issue['number']}: {issue['title']}\n"
|
||||
content += f"**Labels:** {labels} | **Created:** {issue['created_at'][:10]}\n"
|
||||
content += f"[View on GitHub]({issue['url']})\n\n"
|
||||
|
||||
content += f"\n## Recently Closed Issues ({len(closed_issues)})\n\n"
|
||||
for issue in closed_issues[:10]:
|
||||
for issue in closed_issues:
|
||||
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
|
||||
content += f"### #{issue['number']}: {issue['title']}\n"
|
||||
content += f"**Labels:** {labels} | **Closed:** {issue['closed_at'][:10]}\n"
|
||||
|
||||
@@ -47,6 +47,7 @@ COMMAND_MODULES = {
|
||||
"scrape": "skill_seekers.cli.doc_scraper",
|
||||
"github": "skill_seekers.cli.github_scraper",
|
||||
"pdf": "skill_seekers.cli.pdf_scraper",
|
||||
"word": "skill_seekers.cli.word_scraper",
|
||||
"unified": "skill_seekers.cli.unified_scraper",
|
||||
"enhance": "skill_seekers.cli.enhance_command",
|
||||
"enhance-status": "skill_seekers.cli.enhance_status",
|
||||
|
||||
@@ -12,6 +12,7 @@ from .config_parser import ConfigParser
|
||||
from .scrape_parser import ScrapeParser
|
||||
from .github_parser import GitHubParser
|
||||
from .pdf_parser import PDFParser
|
||||
from .word_parser import WordParser
|
||||
from .unified_parser import UnifiedParser
|
||||
from .enhance_parser import EnhanceParser
|
||||
from .enhance_status_parser import EnhanceStatusParser
|
||||
@@ -41,6 +42,7 @@ PARSERS = [
|
||||
EnhanceParser(),
|
||||
EnhanceStatusParser(),
|
||||
PDFParser(),
|
||||
WordParser(),
|
||||
UnifiedParser(),
|
||||
EstimateParser(),
|
||||
InstallParser(),
|
||||
|
||||
32
src/skill_seekers/cli/parsers/word_parser.py
Normal file
32
src/skill_seekers/cli/parsers/word_parser.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""Word document subcommand parser.
|
||||
|
||||
Uses shared argument definitions from arguments.word to ensure
|
||||
consistency with the standalone word_scraper module.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
from skill_seekers.cli.arguments.word import add_word_arguments
|
||||
|
||||
|
||||
class WordParser(SubcommandParser):
    """Parser for the ``word`` subcommand of the unified CLI.

    All argument definitions are delegated to arguments/word.py so this
    subcommand stays in sync with the standalone word_scraper entry point.
    """

    @property
    def name(self) -> str:
        # Subcommand name as typed on the command line.
        return "word"

    @property
    def help(self) -> str:
        # Short help line shown in the top-level command list.
        return "Extract from Word document (.docx)"

    @property
    def description(self) -> str:
        # Longer description shown in the subcommand's own --help output.
        return "Extract content from Word document (.docx) and generate skill"

    def add_arguments(self, parser):
        """Add word-specific arguments.

        Uses shared argument definitions to ensure consistency
        with word_scraper.py (standalone scraper).
        """
        add_word_arguments(parser)
|
||||
@@ -319,7 +319,7 @@ class PDFToSkillConverter:
|
||||
code_list = page.get("code_samples") or page.get("code_blocks")
|
||||
if code_list:
|
||||
f.write("### Code Examples\n\n")
|
||||
for code in code_list[:3]: # Limit to top 3
|
||||
for code in code_list:
|
||||
lang = code.get("language", "")
|
||||
f.write(f"```{lang}\n{code['code']}\n```\n\n")
|
||||
|
||||
@@ -721,21 +721,44 @@ def main():
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Traditional Enhancement (complements workflow system)
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Note: Runs independently of workflow system (they complement each other)
|
||||
if getattr(args, "enhance_level", 0) > 0:
|
||||
# Traditional AI enhancement (API or LOCAL mode)
|
||||
import os
|
||||
|
||||
api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
|
||||
mode = "API" if api_key else "LOCAL"
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("🤖 Traditional AI Enhancement")
|
||||
print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
|
||||
print("=" * 80)
|
||||
if workflow_executed:
|
||||
print(f" Running after workflow: {workflow_name}")
|
||||
print(
|
||||
" (Workflow provides specialized analysis, enhancement provides general improvements)"
|
||||
)
|
||||
print(" (Use --enhance-workflow for more control)")
|
||||
print("")
|
||||
# Note: PDF scraper uses enhance_level instead of enhance/enhance_local
|
||||
# This is consistent with the new unified enhancement system
|
||||
|
||||
skill_dir = converter.skill_dir
|
||||
if api_key:
|
||||
try:
|
||||
from skill_seekers.cli.enhance_skill import enhance_skill_md
|
||||
|
||||
enhance_skill_md(skill_dir, api_key)
|
||||
print("✅ API enhancement complete!")
|
||||
except ImportError:
|
||||
print("❌ API enhancement not available. Falling back to LOCAL mode...")
|
||||
from pathlib import Path
|
||||
from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
|
||||
|
||||
enhancer = LocalSkillEnhancer(Path(skill_dir))
|
||||
enhancer.run(headless=True)
|
||||
print("✅ Local enhancement complete!")
|
||||
else:
|
||||
from pathlib import Path
|
||||
from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
|
||||
|
||||
enhancer = LocalSkillEnhancer(Path(skill_dir))
|
||||
enhancer.run(headless=True)
|
||||
print("✅ Local enhancement complete!")
|
||||
|
||||
except RuntimeError as e:
|
||||
print(f"\n❌ Error: {e}", file=sys.stderr)
|
||||
|
||||
@@ -60,6 +60,9 @@ class SourceDetector:
|
||||
if source.endswith(".pdf"):
|
||||
return cls._detect_pdf(source)
|
||||
|
||||
if source.endswith(".docx"):
|
||||
return cls._detect_word(source)
|
||||
|
||||
# 2. Directory detection
|
||||
if os.path.isdir(source):
|
||||
return cls._detect_local(source)
|
||||
@@ -85,6 +88,7 @@ class SourceDetector:
|
||||
" GitHub: skill-seekers create facebook/react\n"
|
||||
" Local: skill-seekers create ./my-project\n"
|
||||
" PDF: skill-seekers create tutorial.pdf\n"
|
||||
" DOCX: skill-seekers create document.docx\n"
|
||||
" Config: skill-seekers create configs/react.json"
|
||||
)
|
||||
|
||||
@@ -104,6 +108,14 @@ class SourceDetector:
|
||||
type="pdf", parsed={"file_path": source}, suggested_name=name, raw_input=source
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _detect_word(cls, source: str) -> SourceInfo:
|
||||
"""Detect Word document (.docx) source."""
|
||||
name = os.path.splitext(os.path.basename(source))[0]
|
||||
return SourceInfo(
|
||||
type="word", parsed={"file_path": source}, suggested_name=name, raw_input=source
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _detect_local(cls, source: str) -> SourceInfo:
|
||||
"""Detect local directory source."""
|
||||
@@ -190,6 +202,13 @@ class SourceDetector:
|
||||
if not os.path.isfile(file_path):
|
||||
raise ValueError(f"Path is not a file: {file_path}")
|
||||
|
||||
elif source_info.type == "word":
|
||||
file_path = source_info.parsed["file_path"]
|
||||
if not os.path.exists(file_path):
|
||||
raise ValueError(f"Word document does not exist: {file_path}")
|
||||
if not os.path.isfile(file_path):
|
||||
raise ValueError(f"Path is not a file: {file_path}")
|
||||
|
||||
elif source_info.type == "config":
|
||||
config_path = source_info.parsed["config_path"]
|
||||
if not os.path.exists(config_path):
|
||||
|
||||
@@ -73,11 +73,12 @@ class UnifiedScraper:
|
||||
"documentation": [], # List of doc sources
|
||||
"github": [], # List of github sources
|
||||
"pdf": [], # List of pdf sources
|
||||
"word": [], # List of word sources
|
||||
"local": [], # List of local sources (docs or code)
|
||||
}
|
||||
|
||||
# Track source index for unique naming (multi-source support)
|
||||
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "local": 0}
|
||||
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "local": 0}
|
||||
|
||||
# Output paths - cleaner organization
|
||||
self.name = self.config["name"]
|
||||
@@ -151,6 +152,8 @@ class UnifiedScraper:
|
||||
self._scrape_github(source)
|
||||
elif source_type == "pdf":
|
||||
self._scrape_pdf(source)
|
||||
elif source_type == "word":
|
||||
self._scrape_word(source)
|
||||
elif source_type == "local":
|
||||
self._scrape_local(source)
|
||||
else:
|
||||
@@ -514,6 +517,65 @@ class UnifiedScraper:
|
||||
|
||||
logger.info(f"✅ PDF: {len(pdf_data.get('pages', []))} pages extracted")
|
||||
|
||||
    def _scrape_word(self, source: dict[str, Any]):
        """Scrape a single Word document (.docx) source.

        Extracts the document via WordToSkillConverter, caches the extracted
        JSON under self.data_dir, records the result in
        self.scraped_data["word"], and best-effort builds a standalone
        SKILL.md for later synthesis.

        Args:
            source: Source config dict; reads "path" (the .docx file) and
                optionally "name" (used in the generated description).
        """
        try:
            from skill_seekers.cli.word_scraper import WordToSkillConverter
        except ImportError:
            # Optional dependency path: word support may not be installed.
            logger.error("word_scraper.py not found")
            return

        # Multi-source support: get a unique index for this Word source
        # (counter is incremented so a second .docx gets a distinct name).
        idx = self._source_counters["word"]
        self._source_counters["word"] += 1

        # Extract Word identifier for unique naming (filename without extension)
        docx_path = source["path"]
        docx_id = os.path.splitext(os.path.basename(docx_path))[0]

        # Create config for the Word scraper; name embeds skill name, index,
        # and file stem so cache artifacts from multiple sources never collide.
        word_config = {
            "name": f"{self.name}_word_{idx}_{docx_id}",
            "docx_path": source["path"],
            "description": f"{source.get('name', docx_id)} documentation",
        }

        # Scrape
        logger.info(f"Scraping Word document: {source['path']}")
        converter = WordToSkillConverter(word_config)

        # Extract Word content (writes extracted JSON to converter.data_file)
        converter.extract_docx()

        # Load extracted data back from the file the converter wrote
        word_data_file = converter.data_file
        with open(word_data_file, encoding="utf-8") as f:
            word_data = json.load(f)

        # Copy data file to this scraper's cache directory
        cache_word_data = os.path.join(self.data_dir, f"word_data_{idx}_{docx_id}.json")
        shutil.copy(word_data_file, cache_word_data)

        # Record this source's results for the later synthesis step
        self.scraped_data["word"].append(
            {
                "docx_path": docx_path,
                "docx_id": docx_id,
                "idx": idx,
                "data": word_data,
                "data_file": cache_word_data,
            }
        )

        # Build standalone SKILL.md for synthesis; deliberately best-effort —
        # a failed standalone build must not abort the multi-source run.
        try:
            converter.build_skill()
            logger.info("✅ Word: Standalone SKILL.md created")
        except Exception as e:
            logger.warning(f"⚠️ Failed to build standalone Word SKILL.md: {e}")

        # NOTE(review): "pages" presumably mirrors the PDF extractor's schema
        # (one entry per section) — confirm against word_scraper's output.
        logger.info(f"✅ Word: {len(word_data.get('pages', []))} sections extracted")
|
||||
|
||||
def _scrape_local(self, source: dict[str, Any]):
|
||||
"""
|
||||
Scrape local directory (documentation files or source code).
|
||||
|
||||
1054
src/skill_seekers/cli/word_scraper.py
Normal file
1054
src/skill_seekers/cli/word_scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user