Files
skill-seekers-reference/src/skill_seekers/cli/word_scraper.py
yusyus 064405c052 fix: resolve 18 bugs and code quality issues across adaptors, CLI, and chunking pipeline
Bug fixes:
- Fix --var flag silently dropped in create routing (args.workflow_var → args.var)
- Fix double _score_code_quality() call in word scraper
- Add .docx file extension validation in WordToSkillConverter
- Fix weaviate ImportError masked by generic Exception handler
- Fix RAG chunking crash using non-existent converter.output_dir

Chunking pipeline improvements:
- Wire --chunk-overlap-tokens through entire package pipeline
  (package_skill → adaptor.package → format_skill_md → _maybe_chunk_content → RAGChunker)
- Add auto-scaling overlap: max(50, chunk_tokens//10) when chunk size is non-default
- Rename --no-preserve-code to --no-preserve-code-blocks (backward-compat alias kept)
- Replace hardcoded 512/50 chunk defaults with DEFAULT_CHUNK_TOKENS/DEFAULT_CHUNK_OVERLAP_TOKENS
  constants across all 12 concrete adaptors, rag_chunker, base, and package_skill

Code quality:
- Extract shared _generate_openai_embeddings() and _generate_st_embeddings() to SkillAdaptor
  base class, removing ~150 lines of duplication from chroma/weaviate/pinecone
- Add Pinecone adaptor with full upload support (pinecone_adaptor.py)

Tests (14 new):
- chunk_overlap_tokens parameter wiring, auto-scaling overlap, preserve_code_blocks flag
- .docx/.doc/no-extension file validation, --var flag routing E2E
- Embedding method inheritance verification, backward-compatible flag aliases

Docs:
- Update CHANGELOG, CLI_REFERENCE, API_REFERENCE, packaging guide (EN+ZH)
- Update README test count badge (1880+ → 2283+)

All 2283 tests passing, 8 skipped, 0 failures.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 21:57:59 +03:00

1060 lines
40 KiB
Python

#!/usr/bin/env python3
"""
Word Document (.docx) to Claude Skill Converter (Task B2)
Converts Word documents into Claude AI skills.
Uses mammoth for HTML conversion and python-docx for metadata/tables.
Usage:
python3 word_scraper.py --docx document.docx --name myskill
python3 word_scraper.py --from-json document_extracted.json
"""
import argparse
import json
import logging
import os
import re
import sys
from pathlib import Path
# Optional dependency guard: the module must stay importable without the
# Word extras; callers check WORD_AVAILABLE (via _check_word_deps) before use.
try:
    import mammoth
    import docx as python_docx
except ImportError:
    WORD_AVAILABLE = False
else:
    WORD_AVAILABLE = True

logger = logging.getLogger(__name__)
def _check_word_deps():
    """Ensure the optional Word dependencies were imported successfully.

    Raises:
        RuntimeError: If the module-level import of mammoth/python-docx failed
            (i.e. ``WORD_AVAILABLE`` is false). The message includes install hints.
    """
    if WORD_AVAILABLE:
        return
    raise RuntimeError(
        "mammoth and python-docx are required for Word document support.\n"
        'Install with: pip install "skill-seekers[docx]"\n'
        "Or: pip install mammoth python-docx"
    )
def infer_description_from_word(metadata: dict = None, name: str = "") -> str:
"""Infer skill description from Word document metadata or name.
Args:
metadata: Document metadata dict with title, subject, etc.
name: Skill name for fallback
Returns:
Description string suitable for "Use when..." format
"""
if metadata:
# Try subject field first
if metadata.get("subject"):
desc = str(metadata["subject"]).strip()
if len(desc) > 20:
if len(desc) > 150:
desc = desc[:147] + "..."
return f"Use when {desc.lower()}"
# Try title if meaningful
if metadata.get("title"):
title = str(metadata["title"]).strip()
if len(title) > 10 and not title.lower().endswith(".docx"):
return f"Use when working with {title.lower()}"
return (
f"Use when referencing {name} documentation"
if name
else "Use when referencing this documentation"
)
class WordToSkillConverter:
    """Convert a Word document (.docx) into a Claude skill directory.

    Typical flow: ``extract_docx()`` (or ``load_extracted_data()``) populates
    ``self.extracted_data``; ``build_skill()`` then writes ``SKILL.md`` and the
    ``references/`` files under ``output/<name>/``.
    """

    def __init__(self, config):
        """Store the configuration and derive output paths.

        Args:
            config: Dict with the required key ``name`` and the optional keys
                ``docx_path``, ``description`` and ``categories``.
        """
        self.config = config
        self.name = config["name"]
        self.docx_path = config.get("docx_path", "")
        self.description = (
            config.get("description") or f"Use when referencing {self.name} documentation"
        )
        # Paths
        self.skill_dir = f"output/{self.name}"
        self.data_file = f"output/{self.name}_extracted.json"
        # Categories config
        self.categories = config.get("categories", {})
        # Extracted data (set by extract_docx / load_extracted_data)
        self.extracted_data = None

    def extract_docx(self):
        """Extract content from the Word document using mammoth + python-docx.

        - python-docx provides the core-properties metadata
        - mammoth converts the body to HTML
        - BeautifulSoup parses the HTML, which is split at h1/h2 boundaries
        - LanguageDetector labels code samples that carry no language

        The result is stored in ``self.extracted_data`` and saved as JSON to
        ``self.data_file``.

        Returns:
            True on success.

        Raises:
            RuntimeError: If mammoth/python-docx are not installed.
            FileNotFoundError: If ``self.docx_path`` does not exist.
            ValueError: If ``self.docx_path`` does not end with ``.docx``.
        """
        _check_word_deps()
        from bs4 import BeautifulSoup

        from skill_seekers.cli.language_detector import LanguageDetector

        print(f"\n🔍 Extracting from Word document: {self.docx_path}")
        if not os.path.exists(self.docx_path):
            raise FileNotFoundError(f"Word document not found: {self.docx_path}")
        if not self.docx_path.lower().endswith(".docx"):
            raise ValueError(f"Not a Word document (expected .docx): {self.docx_path}")

        # --- Extract metadata via python-docx ---
        doc = python_docx.Document(self.docx_path)
        core_props = doc.core_properties
        metadata = {
            "title": core_props.title or "",
            "author": core_props.author or "",
            "created": str(core_props.created) if core_props.created else "",
            "modified": str(core_props.modified) if core_props.modified else "",
            "subject": core_props.subject or "",
        }
        # Update description from metadata if not set explicitly
        if not self.config.get("description"):
            self.description = infer_description_from_word(metadata, self.name)

        # --- Convert body to HTML with mammoth ---
        with open(self.docx_path, "rb") as f:
            result = mammoth.convert_to_html(f)

        # --- Parse HTML with BeautifulSoup ---
        soup = BeautifulSoup(result.value, "html.parser")

        # --- Split by h1/h2 heading boundaries into (heading, level, elements) ---
        groups = []
        heading, level, elements = None, None, []
        for elem in soup.children:
            # Skip bare text nodes; only tags carry a name.
            if getattr(elem, "name", None) is None:
                continue
            if elem.name in ("h1", "h2"):
                if heading is not None or elements:
                    groups.append((heading, level, elements))
                heading, level, elements = elem.get_text(strip=True), elem.name, []
            else:
                elements.append(elem)
        if heading is not None or elements:
            groups.append((heading, level, elements))

        sections = [
            _build_section(number, grp_heading, grp_level, grp_elements, doc)
            for number, (grp_heading, grp_level, grp_elements) in enumerate(groups, 1)
        ]

        # If nothing was produced, create one default section named after the file
        if not sections:
            all_elements = [e for e in soup.children if getattr(e, "name", None)]
            sections = [
                _build_section(1, Path(self.docx_path).stem, "h1", all_elements, doc)
            ]

        # --- Collect language statistics ---
        detector = LanguageDetector(min_confidence=0.15)
        languages_detected: dict[str, int] = {}
        total_code_blocks = 0
        for section in sections:
            for code_sample in section.get("code_samples", []):
                lang = code_sample.get("language", "")
                if lang:
                    languages_detected[lang] = languages_detected.get(lang, 0) + 1
                total_code_blocks += 1

        # Detect languages for samples without language
        for section in sections:
            for code_sample in section.get("code_samples", []):
                if code_sample.get("language"):
                    continue
                code = code_sample.get("code", "")
                if not code:
                    continue
                lang, confidence = detector.detect_from_code(code)
                if lang and confidence >= 0.3:
                    code_sample["language"] = lang
                    languages_detected[lang] = languages_detected.get(lang, 0) + 1

        result_data = {
            "source_file": self.docx_path,
            "metadata": metadata,
            "total_sections": len(sections),
            "total_code_blocks": total_code_blocks,
            "total_images": sum(len(s.get("images", [])) for s in sections),
            "languages_detected": languages_detected,
            "pages": sections,  # "pages" key for pipeline compatibility
        }

        # Save extracted data. NOTE: default=str stringifies anything that is
        # not JSON-serializable (e.g. raw image bytes held in the sections).
        os.makedirs(os.path.dirname(self.data_file), exist_ok=True)
        with open(self.data_file, "w", encoding="utf-8") as f:
            json.dump(result_data, f, indent=2, ensure_ascii=False, default=str)
        print(f"\n💾 Saved extracted data to: {self.data_file}")

        self.extracted_data = result_data
        print(
            f"✅ Extracted {len(sections)} sections, "
            f"{total_code_blocks} code blocks, "
            f"{result_data['total_images']} images"
        )
        return True

    def load_extracted_data(self, json_path):
        """Load previously extracted data from JSON into ``self.extracted_data``.

        Returns:
            True on success.
        """
        print(f"\n📂 Loading extracted data from: {json_path}")
        with open(json_path, encoding="utf-8") as f:
            self.extracted_data = json.load(f)
        total = self.extracted_data.get(
            "total_sections", len(self.extracted_data.get("pages", []))
        )
        print(f"✅ Loaded {total} sections")
        return True

    def categorize_content(self):
        """Group the extracted sections into categories.

        Returns:
            Dict mapping category key to ``{"title": str, "pages": list}``.
            With a ``docx_path`` set, a single category holding every section
            is returned; otherwise ``self.categories`` drives the grouping.
        """
        print("\n📋 Categorizing content...")
        categorized = {}
        sections = self.extracted_data.get("pages", [])

        # For single Word source, use single category with all sections
        if self.docx_path:
            docx_basename = Path(self.docx_path).stem
            category_key = self._sanitize_filename(docx_basename)
            categorized[category_key] = {"title": docx_basename, "pages": sections}
            print("✅ Created 1 category (single Word source)")
            print(f" - {docx_basename}: {len(sections)} sections")
            return categorized

        if self.categories:
            first_value = next(iter(self.categories.values()), None)
            if isinstance(first_value, list) and first_value and isinstance(first_value[0], dict):
                # Already categorized format: values are lists of section dicts
                for cat_key, pages in self.categories.items():
                    categorized[cat_key] = {
                        "title": cat_key.replace("_", " ").title(),
                        "pages": pages,
                    }
            else:
                # Keyword-based categorization: values are keyword lists
                for cat_key in self.categories:
                    categorized[cat_key] = {
                        "title": cat_key.replace("_", " ").title(),
                        "pages": [],
                    }
                for section in sections:
                    text = section.get("text", "").lower()
                    heading_text = section.get("heading", "").lower()
                    scores = {}
                    for cat_key, keywords in self.categories.items():
                        if not isinstance(keywords, list):
                            continue
                        score = sum(
                            1
                            for kw in keywords
                            if isinstance(kw, str)
                            and (kw.lower() in text or kw.lower() in heading_text)
                        )
                        if score > 0:
                            scores[cat_key] = score
                    if scores:
                        # Highest keyword hit count wins
                        best_cat = max(scores, key=scores.get)
                        categorized[best_cat]["pages"].append(section)
                    else:
                        categorized.setdefault("other", {"title": "Other", "pages": []})
                        categorized["other"]["pages"].append(section)
        else:
            # No categorization - single category
            categorized["content"] = {"title": "Content", "pages": sections}

        print(f"✅ Created {len(categorized)} categories")
        for cat_data in categorized.values():
            print(f" - {cat_data['title']}: {len(cat_data['pages'])} sections")
        return categorized

    def build_skill(self):
        """Build the complete skill structure under ``self.skill_dir``."""
        print(f"\n🏗️ Building skill: {self.name}")
        # Create directories
        for sub_dir in ("references", "scripts", "assets"):
            os.makedirs(f"{self.skill_dir}/{sub_dir}", exist_ok=True)

        # Categorize content
        categorized = self.categorize_content()

        # Generate reference files
        print("\n📝 Generating reference files...")
        total_categories = len(categorized)
        for section_num, (cat_key, cat_data) in enumerate(categorized.items(), 1):
            self._generate_reference_file(cat_key, cat_data, section_num, total_categories)

        # Generate index
        self._generate_index(categorized)
        # Generate SKILL.md
        self._generate_skill_md(categorized)

        print(f"\n✅ Skill built successfully: {self.skill_dir}/")
        print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")

    def _reference_filename(self, cat_data, section_num, total_categories):
        """Return the bare file name of a category's reference markdown file.

        Single source of truth for reference-file naming so the files written,
        the index links and the SKILL.md navigation always agree.

        Args:
            cat_data: Category dict with a ``pages`` list.
            section_num: 1-based position of the category (used when empty).
            total_categories: Number of categories being generated.
        """
        sections = cat_data["pages"]
        if not sections:
            return f"section_{section_num:02d}.md"
        docx_basename = Path(self.docx_path).stem if self.docx_path else ""
        if total_categories == 1:
            return f"{docx_basename}.md" if docx_basename else "main.md"
        section_nums = [s.get("section_number", i + 1) for i, s in enumerate(sections)]
        base_name = docx_basename if docx_basename else "section"
        return f"{base_name}_s{min(section_nums)}-s{max(section_nums)}.md"

    @staticmethod
    def _markdown_heading_prefix(level, default):
        """Map an HTML level such as ``'h2'`` to a markdown prefix one level deeper.

        Falls back to *default* when *level* is empty or malformed (possible
        when the data was loaded from a hand-edited JSON file).
        """
        if not level:
            return default
        try:
            return "#" * (int(level[1]) + 1)
        except (IndexError, TypeError, ValueError):
            return default

    def _generate_reference_file(self, _cat_key, cat_data, section_num, total_sections):
        """Generate a reference markdown file for a category.

        Args:
            _cat_key: Category key (unused).
            cat_data: Category dict with ``title`` and ``pages``.
            section_num: 1-based position of this category.
            total_sections: Total number of categories being generated.
        """
        sections = cat_data["pages"]
        filename = (
            f"{self.skill_dir}/references/"
            f"{self._reference_filename(cat_data, section_num, total_sections)}"
        )

        with open(filename, "w", encoding="utf-8") as f:
            f.write(f"# {cat_data['title']}\n\n")
            for section in sections:
                sec_num = section.get("section_number", "?")
                heading = section.get("heading", "")
                heading_level = section.get("heading_level", "h1")
                f.write(f"---\n\n**📄 Source: Section {sec_num}**\n\n")

                # Add heading
                if heading:
                    md_level = self._markdown_heading_prefix(heading_level, "##")
                    f.write(f"{md_level} {heading}\n\n")

                # Add sub-headings (h3+) found within the section
                for sub_heading in section.get("headings", []):
                    sub_text = sub_heading.get("text", "")
                    if sub_text:
                        sub_md = self._markdown_heading_prefix(
                            sub_heading.get("level", "h3"), "###"
                        )
                        f.write(f"{sub_md} {sub_text}\n\n")

                # Add text content
                if section.get("text"):
                    f.write(f"{section['text']}\n\n")

                # Add code samples
                code_list = section.get("code_samples", [])
                if code_list:
                    f.write("### Code Examples\n\n")
                    for code in code_list:
                        lang = code.get("language", "")
                        f.write(f"```{lang}\n{code['code']}\n```\n\n")

                # Add tables as markdown
                tables = section.get("tables", [])
                if tables:
                    f.write("### Tables\n\n")
                    for table in tables:
                        headers = table.get("headers", [])
                        if headers:
                            f.write("| " + " | ".join(str(h) for h in headers) + " |\n")
                            f.write("| " + " | ".join("---" for _ in headers) + " |\n")
                        for row in table.get("rows", []):
                            f.write("| " + " | ".join(str(c) for c in row) + " |\n")
                        f.write("\n")

                # Add images
                images = section.get("images", [])
                if images:
                    assets_dir = os.path.join(self.skill_dir, "assets")
                    os.makedirs(assets_dir, exist_ok=True)
                    f.write("### Images\n\n")
                    for img in images:
                        img_index = img.get("index", 0)
                        img_data = img.get("data", b"")
                        img_filename = f"section_{sec_num}_img_{img_index}.png"
                        # Only raw bytes can be written; data reloaded from JSON
                        # is no longer bytes, so no file (and no link) is emitted.
                        if isinstance(img_data, (bytes, bytearray)):
                            img_path = os.path.join(assets_dir, img_filename)
                            with open(img_path, "wb") as img_file:
                                img_file.write(img_data)
                            f.write(f"![Image {img_index}](../assets/{img_filename})\n\n")

                f.write("---\n\n")
        print(f" Generated: {filename}")

    def _generate_index(self, categorized):
        """Generate ``references/index.md`` linking every category file."""
        filename = f"{self.skill_dir}/references/index.md"
        total_categories = len(categorized)

        with open(filename, "w", encoding="utf-8") as f:
            f.write(f"# {self.name.title()} Documentation Reference\n\n")
            f.write("## Categories\n\n")
            for section_num, cat_data in enumerate(categorized.values(), 1):
                sections = cat_data["pages"]
                if sections:
                    section_nums = [
                        s.get("section_number", i + 1) for i, s in enumerate(sections)
                    ]
                    sec_range_str = f"Sections {min(section_nums)}-{max(section_nums)}"
                else:
                    sec_range_str = "N/A"
                link_filename = self._reference_filename(
                    cat_data, section_num, total_categories
                )
                f.write(
                    f"- [{cat_data['title']}]({link_filename}) "
                    f"({len(sections)} sections, {sec_range_str})\n"
                )

            f.write("\n## Statistics\n\n")
            f.write(f"- Total sections: {self.extracted_data.get('total_sections', 0)}\n")
            f.write(f"- Code blocks: {self.extracted_data.get('total_code_blocks', 0)}\n")
            f.write(f"- Images: {self.extracted_data.get('total_images', 0)}\n")

            # Metadata
            metadata = self.extracted_data.get("metadata", {})
            if metadata.get("author"):
                f.write(f"- Author: {metadata['author']}\n")
            if metadata.get("created"):
                f.write(f"- Created: {metadata['created']}\n")
        print(f" Generated: {filename}")

    def _generate_skill_md(self, categorized):
        """Generate the main ``SKILL.md`` file for the skill."""
        filename = f"{self.skill_dir}/SKILL.md"
        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
        # Frontmatter values sit on one line: collapse whitespace, cap the length.
        desc = " ".join(self.description.split())[:1024]

        with open(filename, "w", encoding="utf-8") as f:
            # YAML frontmatter
            f.write("---\n")
            f.write(f"name: {skill_name}\n")
            f.write(f"description: {desc}\n")
            f.write("---\n\n")
            f.write(f"# {self.name.title()} Documentation Skill\n\n")
            f.write(f"{self.description}\n\n")

            # Document metadata
            metadata = self.extracted_data.get("metadata", {})
            if any(metadata.values()):
                f.write("## 📋 Document Information\n\n")
                for label, key in (
                    ("Title", "title"),
                    ("Author", "author"),
                    ("Created", "created"),
                    ("Modified", "modified"),
                ):
                    if metadata.get(key):
                        f.write(f"**{label}:** {metadata[key]}\n\n")

            # When to Use
            f.write("## 💡 When to Use This Skill\n\n")
            f.write("Use this skill when you need to:\n")
            f.write(f"- Understand {self.name} concepts and fundamentals\n")
            f.write("- Look up API references and technical specifications\n")
            f.write("- Find code examples and implementation patterns\n")
            f.write("- Review tutorials, guides, and best practices\n")
            f.write("- Explore the complete documentation structure\n\n")

            # Section Overview
            total_sections = self.extracted_data.get("total_sections", 0)
            f.write("## 📖 Section Overview\n\n")
            f.write(f"**Total Sections:** {total_sections}\n\n")
            f.write("**Content Breakdown:**\n\n")
            for cat_data in categorized.values():
                f.write(f"- **{cat_data['title']}**: {len(cat_data['pages'])} sections\n")
            f.write("\n")

            # Key Concepts from headings
            f.write(self._format_key_concepts())

            # Quick Reference patterns
            f.write("## ⚡ Quick Reference\n\n")
            f.write(self._format_patterns_from_content())

            # Code examples (top 15, grouped by language)
            all_code = []
            for section in self.extracted_data.get("pages", []):
                all_code.extend(section.get("code_samples", []))
            all_code.sort(key=lambda x: x.get("quality_score", 0), reverse=True)
            top_code = all_code[:15]
            if top_code:
                f.write("## 📝 Code Examples\n\n")
                f.write("*High-quality examples extracted from documentation*\n\n")
                by_lang: dict[str, list] = {}
                for code in top_code:
                    # An empty language (undetected) is grouped as "unknown"
                    # instead of producing a heading with no name.
                    by_lang.setdefault(code.get("language") or "unknown", []).append(code)
                for lang in sorted(by_lang):
                    examples = by_lang[lang]
                    f.write(f"### {lang.title()} Examples ({len(examples)})\n\n")
                    for i, code in enumerate(examples[:5], 1):
                        quality = code.get("quality_score", 0)
                        code_text = code.get("code", "")
                        f.write(f"**Example {i}** (Quality: {quality:.1f}/10):\n\n")
                        f.write(f"```{lang}\n")
                        if len(code_text) <= 500:
                            f.write(code_text)
                        else:
                            f.write(code_text[:500] + "\n...")
                        f.write("\n```\n\n")

            # Table Summary (first 5 tables)
            all_tables = [
                (section.get("heading", ""), table)
                for section in self.extracted_data.get("pages", [])
                for table in section.get("tables", [])
            ]
            if all_tables:
                f.write("## 📊 Table Summary\n\n")
                f.write(f"*{len(all_tables)} table(s) found in document*\n\n")
                for section_heading, table in all_tables[:5]:
                    if section_heading:
                        f.write(f"**From section: {section_heading}**\n\n")
                    headers = table.get("headers", [])
                    rows = table.get("rows", [])
                    if headers:
                        f.write("| " + " | ".join(str(h) for h in headers) + " |\n")
                        f.write("| " + " | ".join("---" for _ in headers) + " |\n")
                    for row in rows[:5]:
                        f.write("| " + " | ".join(str(c) for c in row) + " |\n")
                    f.write("\n")

            # Statistics
            f.write("## 📊 Documentation Statistics\n\n")
            f.write(f"- **Total Sections**: {total_sections}\n")
            f.write(f"- **Code Blocks**: {self.extracted_data.get('total_code_blocks', 0)}\n")
            f.write(f"- **Images/Diagrams**: {self.extracted_data.get('total_images', 0)}\n")
            f.write(f"- **Tables**: {len(all_tables)}\n")
            langs = self.extracted_data.get("languages_detected", {})
            if langs:
                f.write(f"- **Programming Languages**: {len(langs)}\n\n")
                f.write("**Language Breakdown:**\n\n")
                for lang, count in sorted(langs.items(), key=lambda x: x[1], reverse=True):
                    f.write(f"- {lang}: {count} examples\n")
            f.write("\n")

            # Navigation: list the files that were actually generated
            f.write("## 🗺️ Navigation\n\n")
            f.write("**Reference Files:**\n\n")
            total_categories = len(categorized)
            for section_num, cat_data in enumerate(categorized.values(), 1):
                cat_file = self._reference_filename(cat_data, section_num, total_categories)
                f.write(f"- `references/{cat_file}` - {cat_data['title']}\n")
            f.write("\n")
            f.write("See `references/index.md` for complete documentation structure.\n\n")

            # Footer
            f.write("---\n\n")
            f.write("**Generated by Skill Seeker** | Word Document Scraper\n")

        with open(filename, encoding="utf-8") as f:
            line_count = len(f.read().split("\n"))
        print(f" Generated: {filename} ({line_count} lines)")

    def _format_key_concepts(self) -> str:
        """Build the "Key Concepts" markdown block from section headings.

        Returns:
            Markdown text, or an empty string when no heading longer than three
            characters exists.
        """
        all_headings = []
        for section in self.extracted_data.get("pages", []):
            # Main heading
            heading = section.get("heading", "").strip()
            level = section.get("heading_level", "h1")
            if heading and len(heading) > 3:
                all_headings.append((level, heading))
            # Sub-headings
            for sub in section.get("headings", []):
                text = sub.get("text", "").strip()
                if text and len(text) > 3:
                    all_headings.append((sub.get("level", "h3"), text))

        if not all_headings:
            return ""

        parts = ["## 🔑 Key Concepts\n\n", "*Main topics covered in this documentation*\n\n"]
        h1_headings = [text for level, text in all_headings if level == "h1"]
        h2_headings = [text for level, text in all_headings if level == "h2"]
        if h1_headings:
            parts.append("**Major Topics:**\n\n")
            parts.extend(f"- {heading}\n" for heading in h1_headings[:10])
            parts.append("\n")
        if h2_headings:
            parts.append("**Subtopics:**\n\n")
            parts.extend(f"- {heading}\n" for heading in h2_headings[:15])
            parts.append("\n")
        return "".join(parts)

    def _format_patterns_from_content(self) -> str:
        """Summarize sections whose heading matches a common documentation keyword.

        Returns:
            Markdown text grouping up to three matching sections per keyword,
            or a generic pointer to the reference files when nothing matches.
        """
        pattern_keywords = [
            "getting started",
            "installation",
            "configuration",
            "usage",
            "api",
            "examples",
            "tutorial",
            "guide",
            "best practices",
            "troubleshooting",
            "faq",
        ]
        patterns = []
        for section in self.extracted_data.get("pages", []):
            heading_text = section.get("heading", "").lower()
            sec_num = section.get("section_number", 0)
            # Only the first matching keyword (in list order) classifies a section
            for keyword in pattern_keywords:
                if keyword in heading_text:
                    patterns.append(
                        {
                            "type": keyword.title(),
                            "heading": section.get("heading", ""),
                            "section": sec_num,
                        }
                    )
                    break

        if not patterns:
            return "*See reference files for detailed content*\n\n"

        by_type: dict[str, list] = {}
        for pattern in patterns:
            by_type.setdefault(pattern["type"], []).append(pattern)

        parts = ["*Common documentation patterns found:*\n\n"]
        for ptype in sorted(by_type):
            items = by_type[ptype]
            parts.append(f"**{ptype}** ({len(items)} sections):\n")
            parts.extend(
                f"- {item['heading']} (section {item['section']})\n" for item in items[:3]
            )
            parts.append("\n")
        return "".join(parts)

    def _sanitize_filename(self, name):
        """Convert a string to a safe, lower-case, underscore-separated file name."""
        safe = re.sub(r"[^\w\s-]", "", name.lower())
        safe = re.sub(r"[-\s]+", "_", safe)
        return safe
# ---------------------------------------------------------------------------
# HTML-to-sections helper (module-level for clarity)
# ---------------------------------------------------------------------------
def _build_section(
section_number: int,
heading: str | None,
heading_level: str | None,
elements: list,
doc,
) -> dict:
"""Build a section dict from a list of BeautifulSoup elements.
Args:
section_number: 1-based section index
heading: Heading text (or None for preamble)
heading_level: 'h1', 'h2', etc.
elements: List of BeautifulSoup Tag objects belonging to this section
doc: python-docx Document (used for table cross-reference, not currently used)
Returns:
Section dict compatible with the intermediate JSON format
"""
text_parts = []
code_samples = []
tables = []
sub_headings = []
images = []
for elem in elements:
if not hasattr(elem, "name") or elem.name is None:
continue
tag = elem.name
# Sub-headings (h3, h4, h5, h6) within the section
if tag in ("h3", "h4", "h5", "h6"):
sub_text = elem.get_text(strip=True)
if sub_text:
sub_headings.append({"level": tag, "text": sub_text})
continue
# Code blocks
if tag == "pre" or (tag == "code" and elem.find_parent("pre") is None):
code_elem = elem.find("code") if tag == "pre" else elem
if code_elem:
code_text = code_elem.get_text()
else:
code_text = elem.get_text()
code_text = code_text.strip()
if code_text:
# Try to detect language from class attribute
classes = (code_elem or elem).get("class", [])
lang = ""
for cls in classes:
if cls.startswith("language-") or cls.startswith("lang-"):
lang = cls.split("-", 1)[1]
break
quality_score = _score_code_quality(code_text)
code_samples.append(
{"code": code_text, "language": lang, "quality_score": quality_score}
)
continue
# Tables
if tag == "table":
table_data = _extract_table_from_html(elem)
if table_data:
tables.append(table_data)
continue
# Images
if tag == "img":
# mammoth embeds images as data URIs; extract if present
src = elem.get("src", "")
if src.startswith("data:"):
import base64
try:
header, b64data = src.split(",", 1)
img_bytes = base64.b64decode(b64data)
images.append(
{
"index": len(images),
"data": img_bytes,
"width": int(elem.get("width", 0) or 0),
"height": int(elem.get("height", 0) or 0),
}
)
except Exception:
pass
continue
# Detect code in <p> elements that contain <br> tags (multi-line content)
# Mammoth renders monospace/Courier paragraphs as <p> with <br> — not <pre>
if tag == "p" and elem.find("br"):
raw_text = elem.get_text(separator="\n").strip()
# Exclude bullet-point / prose lists (•, *, -)
if raw_text and not re.search(r"^[•\-\*]\s", raw_text, re.MULTILINE):
quality_score = _score_code_quality(raw_text)
if quality_score >= 5.5:
code_samples.append(
{"code": raw_text, "language": "", "quality_score": quality_score}
)
continue
# Regular text/paragraph content
text = elem.get_text(separator=" ", strip=True)
if text:
text_parts.append(text)
return {
"section_number": section_number,
"heading": heading or "",
"heading_level": heading_level or "h1",
"text": "\n\n".join(text_parts),
"headings": sub_headings,
"code_samples": code_samples,
"tables": tables,
"images": images,
}
def _extract_table_from_html(table_elem) -> dict | None:
"""Extract headers and rows from a BeautifulSoup <table> element."""
headers = []
rows = []
# Try <thead> first for headers
thead = table_elem.find("thead")
if thead:
header_row = thead.find("tr")
if header_row:
headers = [th.get_text(strip=True) for th in header_row.find_all(["th", "td"])]
# Body rows
tbody = table_elem.find("tbody") or table_elem
for row in tbody.find_all("tr"):
cells = [td.get_text(strip=True) for td in row.find_all(["td", "th"])]
# Skip the header row we already captured
if cells and cells != headers:
rows.append(cells)
# If no explicit thead, use first row as header
if not headers and rows:
headers = rows.pop(0)
if not headers and not rows:
return None
return {"headers": headers, "rows": rows}
def _score_code_quality(code: str) -> float:
"""Simple quality heuristic for code blocks (0-10 scale)."""
if not code:
return 0.0
score = 5.0
lines = code.strip().split("\n")
line_count = len(lines)
# More lines = more substantial
if line_count >= 10:
score += 2.0
elif line_count >= 5:
score += 1.0
# Has function/class definitions
if re.search(r"\b(def |class |function |func |fn )", code):
score += 1.5
# Has imports/require
if re.search(r"\b(import |from .+ import|require\(|#include|using )", code):
score += 0.5
# Has indentation (common in Python, JS, etc.)
if re.search(r"^ ", code, re.MULTILINE):
score += 0.5
# Has assignment, operators, or common code syntax
if re.search(r"[=:{}()\[\]]", code):
score += 0.3
# Very short snippets get penalized
if len(code) < 30:
score -= 2.0
return min(10.0, max(0.0, score))
def _run_local_enhancement(skill_dir):
    """Run the LOCAL (headless) skill enhancer on *skill_dir*."""
    from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer

    enhancer = LocalSkillEnhancer(Path(skill_dir))
    enhancer.run(headless=True)
    print("✅ Local enhancement complete!")


def main():
    """CLI entry point: convert a .docx (or a previously extracted JSON) to a skill.

    Returns:
        0 on success. Failures terminate the process via ``sys.exit(1)``
        (argument errors via ``parser.error``).
    """
    # NOTE: do not import ``os`` or ``Path`` inside this function. A
    # function-local import makes the name local to the whole function, so
    # earlier uses (e.g. ``Path(args.docx)``) raise UnboundLocalError.
    from .arguments.word import add_word_arguments

    parser = argparse.ArgumentParser(
        description="Convert Word document (.docx) to Claude skill",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    add_word_arguments(parser)
    args = parser.parse_args()

    # Set logging level
    if getattr(args, "quiet", False):
        logging.getLogger().setLevel(logging.WARNING)
    elif getattr(args, "verbose", False):
        logging.getLogger().setLevel(logging.DEBUG)

    # Handle --dry-run
    if getattr(args, "dry_run", False):
        source = getattr(args, "docx", None) or getattr(args, "from_json", None) or "(none)"
        print(f"\n{'=' * 60}")
        print("DRY RUN: Word Document Extraction")
        print(f"{'=' * 60}")
        print(f"Source: {source}")
        print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}")
        print(f"Enhance level: {getattr(args, 'enhance_level', 0)}")
        print("\n✅ Dry run complete")
        return 0

    # Validate inputs
    if not (getattr(args, "docx", None) or getattr(args, "from_json", None)):
        parser.error("Must specify --docx or --from-json")

    # Build from JSON workflow
    if getattr(args, "from_json", None):
        name = Path(args.from_json).stem.replace("_extracted", "")
        config = {
            "name": getattr(args, "name", None) or name,
            "description": getattr(args, "description", None)
            or f"Use when referencing {name} documentation",
        }
        try:
            converter = WordToSkillConverter(config)
            converter.load_extracted_data(args.from_json)
            converter.build_skill()
        except Exception as e:
            print(f"\n❌ Error: {e}", file=sys.stderr)
            sys.exit(1)
        return 0

    # Direct DOCX mode
    if not getattr(args, "name", None):
        # Auto-detect name from filename
        args.name = Path(args.docx).stem

    config = {
        "name": args.name,
        "docx_path": args.docx,
        # Pass None so extract_docx() can infer from document metadata (subject/title)
        "description": getattr(args, "description", None),
    }
    if getattr(args, "categories", None):
        config["categories"] = args.categories

    try:
        converter = WordToSkillConverter(config)

        # Extract
        if not converter.extract_docx():
            print("\n❌ Word extraction failed - see error above", file=sys.stderr)
            sys.exit(1)

        # Build skill
        converter.build_skill()

        # Enhancement Workflow Integration
        from skill_seekers.cli.workflow_runner import run_workflows

        workflow_executed, workflow_names = run_workflows(args)
        workflow_name = ", ".join(workflow_names) if workflow_names else None

        # Traditional enhancement (complements workflow system)
        if getattr(args, "enhance_level", 0) > 0:
            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
            mode = "API" if api_key else "LOCAL"
            print("\n" + "=" * 80)
            print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
            print("=" * 80)
            if workflow_executed:
                print(f" Running after workflow: {workflow_name}")
                print(
                    " (Workflow provides specialized analysis, enhancement provides general improvements)"
                )
            print("")

            skill_dir = converter.skill_dir
            if api_key:
                try:
                    from skill_seekers.cli.enhance_skill import enhance_skill_md

                    enhance_skill_md(skill_dir, api_key)
                    print("✅ API enhancement complete!")
                except ImportError:
                    print("❌ API enhancement not available. Falling back to LOCAL mode...")
                    _run_local_enhancement(skill_dir)
            else:
                _run_local_enhancement(skill_dir)
    except RuntimeError as e:
        print(f"\n❌ Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Unexpected error during Word processing: {e}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        sys.exit(1)

    return 0


if __name__ == "__main__":
    sys.exit(main())