diff --git a/AGENTS.md b/AGENTS.md index 6a59bfe..553288a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -12,10 +12,12 @@ This file provides essential guidance for AI coding agents working with the Skil | Attribute | Value | |-----------|-------| -| **Current Version** | 3.0.0 | +| **Current Version** | 3.1.3 | | **Python Version** | 3.10+ (tested on 3.10, 3.11, 3.12, 3.13) | | **License** | MIT | | **Package Name** | `skill-seekers` (PyPI) | +| **Source Files** | 169 Python files | +| **Test Files** | 101 test files | | **Website** | https://skillseekersweb.com/ | | **Repository** | https://github.com/yusufkaraaslan/Skill_Seekers | @@ -55,7 +57,7 @@ This file provides essential guidance for AI coding agents working with the Skil ``` /mnt/1ece809a-2821-4f10-aecb-fcdf34760c0b/Git/Skill_Seekers/ ├── src/skill_seekers/ # Main source code (src/ layout) -│ ├── cli/ # CLI tools and commands (~42k lines) +│ ├── cli/ # CLI tools and commands (~70 modules) │ │ ├── adaptors/ # Platform adaptors (Strategy pattern) │ │ │ ├── base.py # Abstract base class (SkillAdaptor) │ │ │ ├── claude.py # Claude AI adaptor @@ -70,12 +72,6 @@ This file provides essential guidance for AI coding agents working with the Skil │ │ │ ├── qdrant.py # Qdrant vector DB adaptor │ │ │ ├── weaviate.py # Weaviate vector DB adaptor │ │ │ └── streaming_adaptor.py # Streaming output adaptor -│ │ ├── storage/ # Cloud storage backends -│ │ │ ├── base_storage.py # Storage interface -│ │ │ ├── s3_storage.py # AWS S3 support -│ │ │ ├── gcs_storage.py # Google Cloud Storage -│ │ │ └── azure_storage.py # Azure Blob Storage -│ │ ├── parsers/ # CLI argument parsers │ │ ├── arguments/ # CLI argument definitions │ │ ├── presets/ # Preset configuration management │ │ ├── main.py # Unified CLI entry point @@ -85,6 +81,7 @@ This file provides essential guidance for AI coding agents working with the Skil │ │ ├── pdf_scraper.py # PDF extraction │ │ ├── unified_scraper.py # Multi-source scraping │ │ ├── codebase_scraper.py # Local codebase analysis +│ │ ├── enhance_command.py # AI enhancement command │ │ ├── enhance_skill_local.py # AI enhancement (local mode) │ │ ├── package_skill.py # Skill packager │ │ ├── upload_skill.py # Upload to platforms @@ -101,8 +98,8 @@ This file provides essential guidance for AI coding agents working with the Skil │ │ ├── source_manager.py # Config source management │ │ └── tools/ # MCP tool implementations │ │ ├── config_tools.py # Configuration tools -│ │ ├── scraping_tools.py # Scraping tools │ │ ├── packaging_tools.py # Packaging tools +│ │ ├── scraping_tools.py # Scraping tools │ │ ├── source_tools.py # Source management tools │ │ ├── splitting_tools.py # Config splitting tools │ │ ├── vector_db_tools.py # Vector database tools @@ -124,7 +121,7 @@ This file provides essential guidance for AI coding agents working with the Skil │ ├── workflows/ # YAML workflow presets │ ├── _version.py # Version information (reads from pyproject.toml) │ └── __init__.py # Package init -├── tests/ # Test suite (98 test files) +├── tests/ # Test suite (101 test files) ├── configs/ # Preset configuration files ├── docs/ # Documentation (80+ markdown files) │ ├── integrations/ # Platform integration guides @@ -134,17 +131,6 @@ This file provides essential guidance for AI coding agents working with the Skil │ ├── blog/ # Blog posts │ └── roadmap/ # Roadmap documents ├── examples/ # Usage examples -│ ├── langchain-rag-pipeline/ # LangChain example -│ ├── llama-index-query-engine/ # LlamaIndex example -│ ├── pinecone-upsert/ # Pinecone example 
-│ ├── chroma-example/ # Chroma example -│ ├── weaviate-example/ # Weaviate example -│ ├── qdrant-example/ # Qdrant example -│ ├── faiss-example/ # FAISS example -│ ├── haystack-pipeline/ # Haystack example -│ ├── cursor-react-skill/ # Cursor IDE example -│ ├── windsurf-fastapi-context/ # Windsurf example -│ └── continue-dev-universal/ # Continue.dev example ├── .github/workflows/ # CI/CD workflows ├── pyproject.toml # Main project configuration ├── requirements.txt # Pinned dependencies @@ -259,7 +245,7 @@ pytest tests/ -v -m "not slow and not integration" ### Test Architecture -- **98 test files** covering all features +- **101 test files** covering all features - **1880+ tests** passing - CI Matrix: Ubuntu + macOS, Python 3.10-3.12 - Test markers defined in `pyproject.toml`: @@ -316,22 +302,19 @@ mypy src/skill_seekers --show-error-codes --pretty - **Ignored rules:** E501, F541, ARG002, B007, I001, SIM114 - **Import sorting:** isort style with `skill_seekers` as first-party -### MyPy Configuration (from mypy.ini) +### MyPy Configuration (from pyproject.toml) -```ini -[mypy] -python_version = 3.10 -warn_return_any = False -warn_unused_configs = True -disallow_untyped_defs = False -check_untyped_defs = True -ignore_missing_imports = True -no_implicit_optional = True -show_error_codes = True - -# Gradual typing - be lenient for now -disallow_incomplete_defs = False -disallow_untyped_calls = False +```toml +[tool.mypy] +python_version = "3.10" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = false +disallow_incomplete_defs = false +check_untyped_defs = true +ignore_missing_imports = true +show_error_codes = true +pretty = true ``` ### Code Conventions @@ -662,17 +645,6 @@ Preset configs are in `configs/` directory: - `astrovalley_unified.json` - Astrovalley - `configs/integrations/` - Integration-specific configs -### Configuration Documentation - -Preset configs are in `configs/` directory: -- `godot.json` - Godot Engine -- `blender.json` / `blender-unified.json` - Blender Engine -- `claude-code.json` - Claude Code -- `httpx_comprehensive.json` - HTTPX library -- `medusa-mercurjs.json` - Medusa/MercurJS -- `astrovalley_unified.json` - Astrovalley -- `configs/integrations/` - Integration-specific configs - --- ## Key Dependencies @@ -700,6 +672,8 @@ Preset configs are in `configs/` directory: | `python-dotenv` | >=1.1.1 | Environment variables | | `jsonschema` | >=4.25.1 | JSON validation | | `PyYAML` | >=6.0 | YAML parsing | +| `langchain` | >=1.2.10 | LangChain integration | +| `llama-index` | >=0.14.15 | LlamaIndex integration | ### Optional Dependencies @@ -852,4 +826,4 @@ Skill Seekers uses JSON configuration files to define scraping targets. Example *This document is maintained for AI coding agents. 
For human contributors, see README.md and CONTRIBUTING.md.* -*Last updated: 2026-02-16* +*Last updated: 2026-02-24* diff --git a/pyproject.toml b/pyproject.toml index f6b9430..0c2a3ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,6 +109,12 @@ azure = [ "azure-storage-blob>=12.19.0", ] +# Word document (.docx) support +docx = [ + "mammoth>=1.6.0", + "python-docx>=1.1.0", +] + # RAG vector database upload support chroma = [ "chromadb>=0.4.0", @@ -146,6 +152,8 @@ embedding = [ # All optional dependencies combined (dev dependencies now in [dependency-groups]) all = [ + "mammoth>=1.6.0", + "python-docx>=1.1.0", "mcp>=1.25,<2", "httpx>=0.28.1", "httpx-sse>=0.4.3", @@ -186,6 +194,7 @@ skill-seekers-resume = "skill_seekers.cli.resume_command:main" skill-seekers-scrape = "skill_seekers.cli.doc_scraper:main" skill-seekers-github = "skill_seekers.cli.github_scraper:main" skill-seekers-pdf = "skill_seekers.cli.pdf_scraper:main" +skill-seekers-word = "skill_seekers.cli.word_scraper:main" skill-seekers-unified = "skill_seekers.cli.unified_scraper:main" skill-seekers-enhance = "skill_seekers.cli.enhance_command:main" skill-seekers-enhance-status = "skill_seekers.cli.enhance_status:main" diff --git a/src/skill_seekers/cli/arguments/__init__.py b/src/skill_seekers/cli/arguments/__init__.py index 929b36e..128e22c 100644 --- a/src/skill_seekers/cli/arguments/__init__.py +++ b/src/skill_seekers/cli/arguments/__init__.py @@ -21,6 +21,7 @@ from .common import add_common_arguments, COMMON_ARGUMENTS from .scrape import add_scrape_arguments, SCRAPE_ARGUMENTS from .github import add_github_arguments, GITHUB_ARGUMENTS from .pdf import add_pdf_arguments, PDF_ARGUMENTS +from .word import add_word_arguments, WORD_ARGUMENTS from .analyze import add_analyze_arguments, ANALYZE_ARGUMENTS from .unified import add_unified_arguments, UNIFIED_ARGUMENTS from .package import add_package_arguments, PACKAGE_ARGUMENTS @@ -38,11 +39,13 @@ __all__ = [ "add_package_arguments", "add_upload_arguments", "add_enhance_arguments", + "add_word_arguments", # Data "COMMON_ARGUMENTS", "SCRAPE_ARGUMENTS", "GITHUB_ARGUMENTS", "PDF_ARGUMENTS", + "WORD_ARGUMENTS", "ANALYZE_ARGUMENTS", "UNIFIED_ARGUMENTS", "PACKAGE_ARGUMENTS", diff --git a/src/skill_seekers/cli/arguments/create.py b/src/skill_seekers/cli/arguments/create.py index c1aa7ad..03b30c7 100644 --- a/src/skill_seekers/cli/arguments/create.py +++ b/src/skill_seekers/cli/arguments/create.py @@ -389,6 +389,18 @@ PDF_ARGUMENTS: dict[str, dict[str, Any]] = { }, } +# Word document specific (from word.py) +WORD_ARGUMENTS: dict[str, dict[str, Any]] = { + "docx": { + "flags": ("--docx",), + "kwargs": { + "type": str, + "help": "DOCX file path", + "metavar": "PATH", + }, + }, +} + # Multi-source config specific (from unified_scraper.py) CONFIG_ARGUMENTS: dict[str, dict[str, Any]] = { "merge_mode": { @@ -471,6 +483,7 @@ def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]] "github": GITHUB_ARGUMENTS, "local": LOCAL_ARGUMENTS, "pdf": PDF_ARGUMENTS, + "word": WORD_ARGUMENTS, "config": CONFIG_ARGUMENTS, } return source_args.get(source_type, {}) @@ -507,12 +520,13 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default") - 'github': Universal + github-specific - 'local': Universal + local-specific - 'pdf': Universal + pdf-specific + - 'word': Universal + word-specific - 'advanced': Advanced/rare arguments - 'all': All 120+ arguments Args: parser: ArgumentParser to add arguments to - mode: Help mode (default, web, github, local, pdf, advanced, 
all) + mode: Help mode (default, web, github, local, pdf, word, advanced, all) """ # Positional argument for source parser.add_argument( @@ -543,6 +557,10 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default") for arg_name, arg_def in PDF_ARGUMENTS.items(): parser.add_argument(*arg_def["flags"], **arg_def["kwargs"]) + if mode in ["word", "all"]: + for arg_name, arg_def in WORD_ARGUMENTS.items(): + parser.add_argument(*arg_def["flags"], **arg_def["kwargs"]) + if mode in ["config", "all"]: for arg_name, arg_def in CONFIG_ARGUMENTS.items(): parser.add_argument(*arg_def["flags"], **arg_def["kwargs"]) diff --git a/src/skill_seekers/cli/arguments/word.py b/src/skill_seekers/cli/arguments/word.py new file mode 100644 index 0000000..0c254b2 --- /dev/null +++ b/src/skill_seekers/cli/arguments/word.py @@ -0,0 +1,66 @@ +"""Word document command argument definitions. + +This module defines ALL arguments for the word command in ONE place. +Both word_scraper.py (standalone) and parsers/word_parser.py (unified CLI) +import and use these definitions. + +Shared arguments (name, description, output, enhance-level, api-key, +dry-run, verbose, quiet, workflow args) come from common.py / workflow.py +via ``add_all_standard_arguments()``. +""" + +import argparse +from typing import Any + +from .common import add_all_standard_arguments + +# Word-specific argument definitions as data structure +# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run, +# verbose, quiet, workflow args) are registered by add_all_standard_arguments(). +WORD_ARGUMENTS: dict[str, dict[str, Any]] = { + "docx": { + "flags": ("--docx",), + "kwargs": { + "type": str, + "help": "Direct DOCX file path", + "metavar": "PATH", + }, + }, + "from_json": { + "flags": ("--from-json",), + "kwargs": { + "type": str, + "help": "Build skill from extracted JSON", + "metavar": "FILE", + }, + }, +} + + +def add_word_arguments(parser: argparse.ArgumentParser) -> None: + """Add all word command arguments to a parser. + + Registers shared args (name, description, output, enhance-level, api-key, + dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(), + then adds Word-specific args on top. + + The default for --enhance-level is overridden to 0 (disabled) for Word. + """ + # Shared universal args first + add_all_standard_arguments(parser) + + # Override enhance-level default to 0 for Word + for action in parser._actions: + if hasattr(action, "dest") and action.dest == "enhance_level": + action.default = 0 + action.help = ( + "AI enhancement level (auto-detects API vs LOCAL mode): " + "0=disabled (default for Word), 1=SKILL.md only, 2=+architecture/config, 3=full enhancement. 
" + "Mode selection: uses API if ANTHROPIC_API_KEY is set, otherwise LOCAL (Claude Code)" + ) + + # Word-specific args + for arg_name, arg_def in WORD_ARGUMENTS.items(): + flags = arg_def["flags"] + kwargs = arg_def["kwargs"] + parser.add_argument(*flags, **kwargs) diff --git a/src/skill_seekers/cli/create_command.py b/src/skill_seekers/cli/create_command.py index ac9badc..15e68a8 100644 --- a/src/skill_seekers/cli/create_command.py +++ b/src/skill_seekers/cli/create_command.py @@ -131,6 +131,8 @@ class CreateCommand: return self._route_local() elif self.source_info.type == "pdf": return self._route_pdf() + elif self.source_info.type == "word": + return self._route_word() elif self.source_info.type == "config": return self._route_config() else: @@ -320,6 +322,29 @@ class CreateCommand: finally: sys.argv = original_argv + def _route_word(self) -> int: + """Route to Word document scraper (word_scraper.py).""" + from skill_seekers.cli import word_scraper + + # Reconstruct argv for word_scraper + argv = ["word_scraper"] + + # Add DOCX file + file_path = self.source_info.parsed["file_path"] + argv.extend(["--docx", file_path]) + + # Add universal arguments + self._add_common_args(argv) + + # Call word_scraper with modified argv + logger.debug(f"Calling word_scraper with argv: {argv}") + original_argv = sys.argv + try: + sys.argv = argv + return word_scraper.main() + finally: + sys.argv = original_argv + def _route_config(self) -> int: """Route to unified scraper for config files (unified_scraper.py).""" from skill_seekers.cli import unified_scraper @@ -442,6 +467,7 @@ Examples: GitHub: skill-seekers create facebook/react -p standard Local: skill-seekers create ./my-project -p comprehensive PDF: skill-seekers create tutorial.pdf --ocr + DOCX: skill-seekers create document.docx Config: skill-seekers create configs/react.json Source Auto-Detection: @@ -449,6 +475,7 @@ Source Auto-Detection: • owner/repo → GitHub analysis • ./path → local codebase • file.pdf → PDF extraction + • file.docx → Word document extraction • file.json → multi-source config Progressive Help (13 → 120+ flags): @@ -483,6 +510,9 @@ Common Workflows: "--help-local", action="store_true", help=argparse.SUPPRESS, dest="_help_local" ) parser.add_argument("--help-pdf", action="store_true", help=argparse.SUPPRESS, dest="_help_pdf") + parser.add_argument( + "--help-word", action="store_true", help=argparse.SUPPRESS, dest="_help_word" + ) parser.add_argument( "--help-config", action="store_true", help=argparse.SUPPRESS, dest="_help_config" ) @@ -532,6 +562,15 @@ Common Workflows: add_create_arguments(parser_pdf, mode="pdf") parser_pdf.print_help() return 0 + elif args._help_word: + parser_word = argparse.ArgumentParser( + prog="skill-seekers create", + description="Create skill from Word document (.docx)", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + add_create_arguments(parser_word, mode="word") + parser_word.print_help() + return 0 elif args._help_config: parser_config = argparse.ArgumentParser( prog="skill-seekers create", diff --git a/src/skill_seekers/cli/github_scraper.py b/src/skill_seekers/cli/github_scraper.py index 3e1cc88..ebf10e2 100644 --- a/src/skill_seekers/cli/github_scraper.py +++ b/src/skill_seekers/cli/github_scraper.py @@ -1296,14 +1296,14 @@ Use this skill when you need to: closed_issues = [i for i in issues if i["state"] == "closed"] content += f"## Open Issues ({len(open_issues)})\n\n" - for issue in open_issues[:20]: + for issue in open_issues: labels = ", ".join(issue["labels"]) if 
issue["labels"] else "No labels" content += f"### #{issue['number']}: {issue['title']}\n" content += f"**Labels:** {labels} | **Created:** {issue['created_at'][:10]}\n" content += f"[View on GitHub]({issue['url']})\n\n" content += f"\n## Recently Closed Issues ({len(closed_issues)})\n\n" - for issue in closed_issues[:10]: + for issue in closed_issues: labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels" content += f"### #{issue['number']}: {issue['title']}\n" content += f"**Labels:** {labels} | **Closed:** {issue['closed_at'][:10]}\n" diff --git a/src/skill_seekers/cli/main.py b/src/skill_seekers/cli/main.py index 7c9bf80..fb0a478 100644 --- a/src/skill_seekers/cli/main.py +++ b/src/skill_seekers/cli/main.py @@ -47,6 +47,7 @@ COMMAND_MODULES = { "scrape": "skill_seekers.cli.doc_scraper", "github": "skill_seekers.cli.github_scraper", "pdf": "skill_seekers.cli.pdf_scraper", + "word": "skill_seekers.cli.word_scraper", "unified": "skill_seekers.cli.unified_scraper", "enhance": "skill_seekers.cli.enhance_command", "enhance-status": "skill_seekers.cli.enhance_status", diff --git a/src/skill_seekers/cli/parsers/__init__.py b/src/skill_seekers/cli/parsers/__init__.py index 93143f8..d12c2a6 100644 --- a/src/skill_seekers/cli/parsers/__init__.py +++ b/src/skill_seekers/cli/parsers/__init__.py @@ -12,6 +12,7 @@ from .config_parser import ConfigParser from .scrape_parser import ScrapeParser from .github_parser import GitHubParser from .pdf_parser import PDFParser +from .word_parser import WordParser from .unified_parser import UnifiedParser from .enhance_parser import EnhanceParser from .enhance_status_parser import EnhanceStatusParser @@ -41,6 +42,7 @@ PARSERS = [ EnhanceParser(), EnhanceStatusParser(), PDFParser(), + WordParser(), UnifiedParser(), EstimateParser(), InstallParser(), diff --git a/src/skill_seekers/cli/parsers/word_parser.py b/src/skill_seekers/cli/parsers/word_parser.py new file mode 100644 index 0000000..2f174f2 --- /dev/null +++ b/src/skill_seekers/cli/parsers/word_parser.py @@ -0,0 +1,32 @@ +"""Word document subcommand parser. + +Uses shared argument definitions from arguments.word to ensure +consistency with the standalone word_scraper module. +""" + +from .base import SubcommandParser +from skill_seekers.cli.arguments.word import add_word_arguments + + +class WordParser(SubcommandParser): + """Parser for word subcommand.""" + + @property + def name(self) -> str: + return "word" + + @property + def help(self) -> str: + return "Extract from Word document (.docx)" + + @property + def description(self) -> str: + return "Extract content from Word document (.docx) and generate skill" + + def add_arguments(self, parser): + """Add word-specific arguments. + + Uses shared argument definitions to ensure consistency + with word_scraper.py (standalone scraper). 
+ """ + add_word_arguments(parser) diff --git a/src/skill_seekers/cli/pdf_scraper.py b/src/skill_seekers/cli/pdf_scraper.py index 9ffd60f..7328ea4 100644 --- a/src/skill_seekers/cli/pdf_scraper.py +++ b/src/skill_seekers/cli/pdf_scraper.py @@ -319,7 +319,7 @@ class PDFToSkillConverter: code_list = page.get("code_samples") or page.get("code_blocks") if code_list: f.write("### Code Examples\n\n") - for code in code_list[:3]: # Limit to top 3 + for code in code_list: lang = code.get("language", "") f.write(f"```{lang}\n{code['code']}\n```\n\n") @@ -721,21 +721,44 @@ def main(): # ═══════════════════════════════════════════════════════════════════════════ # Traditional Enhancement (complements workflow system) # ═══════════════════════════════════════════════════════════════════════════ - # Note: Runs independently of workflow system (they complement each other) if getattr(args, "enhance_level", 0) > 0: - # Traditional AI enhancement (API or LOCAL mode) + import os + + api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") + mode = "API" if api_key else "LOCAL" + print("\n" + "=" * 80) - print("🤖 Traditional AI Enhancement") + print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") print("=" * 80) if workflow_executed: print(f" Running after workflow: {workflow_name}") print( " (Workflow provides specialized analysis, enhancement provides general improvements)" ) - print(" (Use --enhance-workflow for more control)") print("") - # Note: PDF scraper uses enhance_level instead of enhance/enhance_local - # This is consistent with the new unified enhancement system + + skill_dir = converter.skill_dir + if api_key: + try: + from skill_seekers.cli.enhance_skill import enhance_skill_md + + enhance_skill_md(skill_dir, api_key) + print("✅ API enhancement complete!") + except ImportError: + print("❌ API enhancement not available. Falling back to LOCAL mode...") + from pathlib import Path + from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer + + enhancer = LocalSkillEnhancer(Path(skill_dir)) + enhancer.run(headless=True) + print("✅ Local enhancement complete!") + else: + from pathlib import Path + from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer + + enhancer = LocalSkillEnhancer(Path(skill_dir)) + enhancer.run(headless=True) + print("✅ Local enhancement complete!") except RuntimeError as e: print(f"\n❌ Error: {e}", file=sys.stderr) diff --git a/src/skill_seekers/cli/source_detector.py b/src/skill_seekers/cli/source_detector.py index 8f98408..7f2397c 100644 --- a/src/skill_seekers/cli/source_detector.py +++ b/src/skill_seekers/cli/source_detector.py @@ -60,6 +60,9 @@ class SourceDetector: if source.endswith(".pdf"): return cls._detect_pdf(source) + if source.endswith(".docx"): + return cls._detect_word(source) + # 2. 
Directory detection if os.path.isdir(source): return cls._detect_local(source) @@ -85,6 +88,7 @@ class SourceDetector: " GitHub: skill-seekers create facebook/react\n" " Local: skill-seekers create ./my-project\n" " PDF: skill-seekers create tutorial.pdf\n" + " DOCX: skill-seekers create document.docx\n" " Config: skill-seekers create configs/react.json" ) @@ -104,6 +108,14 @@ class SourceDetector: type="pdf", parsed={"file_path": source}, suggested_name=name, raw_input=source ) + @classmethod + def _detect_word(cls, source: str) -> SourceInfo: + """Detect Word document (.docx) source.""" + name = os.path.splitext(os.path.basename(source))[0] + return SourceInfo( + type="word", parsed={"file_path": source}, suggested_name=name, raw_input=source + ) + @classmethod def _detect_local(cls, source: str) -> SourceInfo: """Detect local directory source.""" @@ -190,6 +202,13 @@ class SourceDetector: if not os.path.isfile(file_path): raise ValueError(f"Path is not a file: {file_path}") + elif source_info.type == "word": + file_path = source_info.parsed["file_path"] + if not os.path.exists(file_path): + raise ValueError(f"Word document does not exist: {file_path}") + if not os.path.isfile(file_path): + raise ValueError(f"Path is not a file: {file_path}") + elif source_info.type == "config": config_path = source_info.parsed["config_path"] if not os.path.exists(config_path): diff --git a/src/skill_seekers/cli/unified_scraper.py b/src/skill_seekers/cli/unified_scraper.py index 7322a9f..092c218 100644 --- a/src/skill_seekers/cli/unified_scraper.py +++ b/src/skill_seekers/cli/unified_scraper.py @@ -73,11 +73,12 @@ class UnifiedScraper: "documentation": [], # List of doc sources "github": [], # List of github sources "pdf": [], # List of pdf sources + "word": [], # List of word sources "local": [], # List of local sources (docs or code) } # Track source index for unique naming (multi-source support) - self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "local": 0} + self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "local": 0} # Output paths - cleaner organization self.name = self.config["name"] @@ -151,6 +152,8 @@ class UnifiedScraper: self._scrape_github(source) elif source_type == "pdf": self._scrape_pdf(source) + elif source_type == "word": + self._scrape_word(source) elif source_type == "local": self._scrape_local(source) else: @@ -514,6 +517,65 @@ class UnifiedScraper: logger.info(f"✅ PDF: {len(pdf_data.get('pages', []))} pages extracted") + def _scrape_word(self, source: dict[str, Any]): + """Scrape Word document (.docx).""" + try: + from skill_seekers.cli.word_scraper import WordToSkillConverter + except ImportError: + logger.error("word_scraper.py not found") + return + + # Multi-source support: Get unique index for this Word source + idx = self._source_counters["word"] + self._source_counters["word"] += 1 + + # Extract Word identifier for unique naming (filename without extension) + docx_path = source["path"] + docx_id = os.path.splitext(os.path.basename(docx_path))[0] + + # Create config for Word scraper + word_config = { + "name": f"{self.name}_word_{idx}_{docx_id}", + "docx_path": source["path"], + "description": f"{source.get('name', docx_id)} documentation", + } + + # Scrape + logger.info(f"Scraping Word document: {source['path']}") + converter = WordToSkillConverter(word_config) + + # Extract Word content + converter.extract_docx() + + # Load extracted data from file + word_data_file = converter.data_file + with open(word_data_file, 
encoding="utf-8") as f: + word_data = json.load(f) + + # Copy data file to cache + cache_word_data = os.path.join(self.data_dir, f"word_data_{idx}_{docx_id}.json") + shutil.copy(word_data_file, cache_word_data) + + # Append to list + self.scraped_data["word"].append( + { + "docx_path": docx_path, + "docx_id": docx_id, + "idx": idx, + "data": word_data, + "data_file": cache_word_data, + } + ) + + # Build standalone SKILL.md for synthesis + try: + converter.build_skill() + logger.info("✅ Word: Standalone SKILL.md created") + except Exception as e: + logger.warning(f"⚠️ Failed to build standalone Word SKILL.md: {e}") + + logger.info(f"✅ Word: {len(word_data.get('pages', []))} sections extracted") + def _scrape_local(self, source: dict[str, Any]): """ Scrape local directory (documentation files or source code). diff --git a/src/skill_seekers/cli/word_scraper.py b/src/skill_seekers/cli/word_scraper.py new file mode 100644 index 0000000..76d068f --- /dev/null +++ b/src/skill_seekers/cli/word_scraper.py @@ -0,0 +1,1054 @@ +#!/usr/bin/env python3 +""" +Word Document (.docx) to Claude Skill Converter (Task B2) + +Converts Word documents into Claude AI skills. +Uses mammoth for HTML conversion and python-docx for metadata/tables. + +Usage: + python3 word_scraper.py --docx document.docx --name myskill + python3 word_scraper.py --from-json document_extracted.json +""" + +import argparse +import json +import logging +import os +import re +import sys +from pathlib import Path + +# Optional dependency guard +try: + import mammoth + import docx as python_docx + + WORD_AVAILABLE = True +except ImportError: + WORD_AVAILABLE = False + +logger = logging.getLogger(__name__) + + +def _check_word_deps(): + """Raise RuntimeError if mammoth/python-docx are not installed.""" + if not WORD_AVAILABLE: + raise RuntimeError( + "mammoth and python-docx are required for Word document support.\n" + 'Install with: pip install "skill-seekers[docx]"\n' + "Or: pip install mammoth python-docx" + ) + + +def infer_description_from_word(metadata: dict = None, name: str = "") -> str: + """Infer skill description from Word document metadata or name. + + Args: + metadata: Document metadata dict with title, subject, etc. + name: Skill name for fallback + + Returns: + Description string suitable for "Use when..." format + """ + if metadata: + # Try subject field first + if metadata.get("subject"): + desc = str(metadata["subject"]).strip() + if len(desc) > 20: + if len(desc) > 150: + desc = desc[:147] + "..." + return f"Use when {desc.lower()}" + + # Try title if meaningful + if metadata.get("title"): + title = str(metadata["title"]).strip() + if len(title) > 10 and not title.lower().endswith(".docx"): + return f"Use when working with {title.lower()}" + + return ( + f"Use when referencing {name} documentation" + if name + else "Use when referencing this documentation" + ) + + +class WordToSkillConverter: + """Convert Word document (.docx) to Claude skill.""" + + def __init__(self, config): + self.config = config + self.name = config["name"] + self.docx_path = config.get("docx_path", "") + self.description = config.get("description") or f"Use when referencing {self.name} documentation" + + # Paths + self.skill_dir = f"output/{self.name}" + self.data_file = f"output/{self.name}_extracted.json" + + # Categories config + self.categories = config.get("categories", {}) + + # Extracted data + self.extracted_data = None + + def extract_docx(self): + """Extract content from Word document using mammoth + python-docx. 
+ + - mammoth converts body content to HTML (leverages Word paragraph styles) + - python-docx provides metadata and fine-grained table access + - BeautifulSoup parses the HTML and splits by h1/h2 heading boundaries + - LanguageDetector identifies code language in blocks + """ + _check_word_deps() + + from bs4 import BeautifulSoup + from skill_seekers.cli.language_detector import LanguageDetector + + print(f"\n🔍 Extracting from Word document: {self.docx_path}") + + if not os.path.exists(self.docx_path): + raise FileNotFoundError(f"Word document not found: {self.docx_path}") + + # --- Extract metadata via python-docx --- + doc = python_docx.Document(self.docx_path) + core_props = doc.core_properties + metadata = { + "title": core_props.title or "", + "author": core_props.author or "", + "created": str(core_props.created) if core_props.created else "", + "modified": str(core_props.modified) if core_props.modified else "", + "subject": core_props.subject or "", + } + + # Update description from metadata if not set explicitly + if not self.config.get("description"): + self.description = infer_description_from_word(metadata, self.name) + + # --- Convert body to HTML with mammoth --- + with open(self.docx_path, "rb") as f: + result = mammoth.convert_to_html(f) + + html_content = result.value + + # --- Parse HTML with BeautifulSoup --- + soup = BeautifulSoup(html_content, "html.parser") + + # --- Split by h1/h2 heading boundaries into sections --- + sections = [] + current_heading = None + current_heading_level = None + current_elements = [] + section_number = 0 + + def _flush_section(): + nonlocal section_number + if current_heading is not None or current_elements: + section_number += 1 + section = _build_section( + section_number, + current_heading, + current_heading_level, + current_elements, + doc, + ) + sections.append(section) + + for elem in soup.children: + if not hasattr(elem, "name") or elem.name is None: + continue + + if elem.name in ("h1", "h2"): + # Flush previous section + _flush_section() + current_heading = elem.get_text(strip=True) + current_heading_level = elem.name + current_elements = [] + else: + current_elements.append(elem) + + # Flush last section + _flush_section() + + # If no sections were created (no headings), create one default section + if not sections: + section_number = 1 + all_elements = [e for e in soup.children if hasattr(e, "name") and e.name] + section = _build_section( + 1, + Path(self.docx_path).stem, + "h1", + all_elements, + doc, + ) + sections = [section] + + # --- Collect language statistics --- + detector = LanguageDetector(min_confidence=0.15) + languages_detected: dict[str, int] = {} + total_code_blocks = 0 + + for section in sections: + for code_sample in section.get("code_samples", []): + lang = code_sample.get("language", "") + if lang: + languages_detected[lang] = languages_detected.get(lang, 0) + 1 + total_code_blocks += 1 + + # Detect languages for samples without language + for section in sections: + for code_sample in section.get("code_samples", []): + if not code_sample.get("language"): + code = code_sample.get("code", "") + if code: + lang, confidence = detector.detect_from_code(code) + if lang and confidence >= 0.3: + code_sample["language"] = lang + languages_detected[lang] = languages_detected.get(lang, 0) + 1 + + result_data = { + "source_file": self.docx_path, + "metadata": metadata, + "total_sections": len(sections), + "total_code_blocks": total_code_blocks, + "total_images": sum(len(s.get("images", [])) for s in sections), + 
"languages_detected": languages_detected, + "pages": sections, # "pages" key for pipeline compatibility + } + + # Save extracted data + os.makedirs(os.path.dirname(self.data_file), exist_ok=True) + with open(self.data_file, "w", encoding="utf-8") as f: + json.dump(result_data, f, indent=2, ensure_ascii=False, default=str) + + print(f"\n💾 Saved extracted data to: {self.data_file}") + self.extracted_data = result_data + print( + f"✅ Extracted {len(sections)} sections, " + f"{total_code_blocks} code blocks, " + f"{result_data['total_images']} images" + ) + return True + + def load_extracted_data(self, json_path): + """Load previously extracted data from JSON.""" + print(f"\n📂 Loading extracted data from: {json_path}") + with open(json_path, encoding="utf-8") as f: + self.extracted_data = json.load(f) + total = self.extracted_data.get("total_sections", len(self.extracted_data.get("pages", []))) + print(f"✅ Loaded {total} sections") + return True + + def categorize_content(self): + """Categorize sections based on headings or keywords.""" + print("\n📋 Categorizing content...") + + categorized = {} + sections = self.extracted_data.get("pages", []) + + # For single Word source, use single category with all sections + if self.docx_path: + docx_basename = Path(self.docx_path).stem + category_key = self._sanitize_filename(docx_basename) + categorized[category_key] = { + "title": docx_basename, + "pages": sections, + } + print("✅ Created 1 category (single Word source)") + print(f" - {docx_basename}: {len(sections)} sections") + return categorized + + # Keyword-based categorization (multi-source scenario) + if self.categories: + first_value = next(iter(self.categories.values()), None) + if isinstance(first_value, list) and first_value and isinstance(first_value[0], dict): + # Already categorized format + for cat_key, pages in self.categories.items(): + categorized[cat_key] = { + "title": cat_key.replace("_", " ").title(), + "pages": pages, + } + else: + # Keyword-based categorization + for cat_key in self.categories: + categorized[cat_key] = { + "title": cat_key.replace("_", " ").title(), + "pages": [], + } + + for section in sections: + text = section.get("text", "").lower() + heading_text = section.get("heading", "").lower() + + scores = {} + for cat_key, keywords in self.categories.items(): + if isinstance(keywords, list): + score = sum( + 1 + for kw in keywords + if isinstance(kw, str) + and (kw.lower() in text or kw.lower() in heading_text) + ) + else: + score = 0 + if score > 0: + scores[cat_key] = score + + if scores: + best_cat = max(scores, key=scores.get) + categorized[best_cat]["pages"].append(section) + else: + if "other" not in categorized: + categorized["other"] = {"title": "Other", "pages": []} + categorized["other"]["pages"].append(section) + else: + # No categorization - single category + categorized["content"] = {"title": "Content", "pages": sections} + + print(f"✅ Created {len(categorized)} categories") + for _cat_key, cat_data in categorized.items(): + print(f" - {cat_data['title']}: {len(cat_data['pages'])} sections") + + return categorized + + def build_skill(self): + """Build complete skill structure.""" + print(f"\n🏗️ Building skill: {self.name}") + + # Create directories + os.makedirs(f"{self.skill_dir}/references", exist_ok=True) + os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True) + os.makedirs(f"{self.skill_dir}/assets", exist_ok=True) + + # Categorize content + categorized = self.categorize_content() + + # Generate reference files + print("\n📝 Generating reference 
files...") + total_sections = len(categorized) + section_num = 1 + for cat_key, cat_data in categorized.items(): + self._generate_reference_file(cat_key, cat_data, section_num, total_sections) + section_num += 1 + + # Generate index + self._generate_index(categorized) + + # Generate SKILL.md + self._generate_skill_md(categorized) + + print(f"\n✅ Skill built successfully: {self.skill_dir}/") + print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/") + + def _generate_reference_file(self, _cat_key, cat_data, section_num, total_sections): + """Generate a reference markdown file for a category.""" + sections = cat_data["pages"] + + # Use docx basename for filename + docx_basename = "" + if self.docx_path: + docx_basename = Path(self.docx_path).stem + + if sections: + section_nums = [s.get("section_number", i + 1) for i, s in enumerate(sections)] + + if total_sections == 1: + filename = ( + f"{self.skill_dir}/references/{docx_basename}.md" + if docx_basename + else f"{self.skill_dir}/references/main.md" + ) + else: + sec_range = f"s{min(section_nums)}-s{max(section_nums)}" + base_name = docx_basename if docx_basename else "section" + filename = f"{self.skill_dir}/references/{base_name}_{sec_range}.md" + else: + filename = f"{self.skill_dir}/references/section_{section_num:02d}.md" + + with open(filename, "w", encoding="utf-8") as f: + f.write(f"# {cat_data['title']}\n\n") + + for section in sections: + sec_num = section.get("section_number", "?") + heading = section.get("heading", "") + heading_level = section.get("heading_level", "h1") + + f.write(f"---\n\n**📄 Source: Section {sec_num}**\n\n") + + # Add heading + if heading: + md_level = "#" * (int(heading_level[1]) + 1) if heading_level else "##" + f.write(f"{md_level} {heading}\n\n") + + # Add sub-headings (h3+) found within the section + for sub_heading in section.get("headings", []): + sub_level = sub_heading.get("level", "h3") + sub_text = sub_heading.get("text", "") + if sub_text: + sub_md = "#" * (int(sub_level[1]) + 1) if sub_level else "###" + f.write(f"{sub_md} {sub_text}\n\n") + + # Add text content + if section.get("text"): + f.write(f"{section['text']}\n\n") + + # Add code samples + code_list = section.get("code_samples", []) + if code_list: + f.write("### Code Examples\n\n") + for code in code_list: + lang = code.get("language", "") + f.write(f"```{lang}\n{code['code']}\n```\n\n") + + # Add tables as markdown + tables = section.get("tables", []) + if tables: + f.write("### Tables\n\n") + for table in tables: + headers = table.get("headers", []) + rows = table.get("rows", []) + if headers: + f.write("| " + " | ".join(str(h) for h in headers) + " |\n") + f.write("| " + " | ".join("---" for _ in headers) + " |\n") + for row in rows: + f.write("| " + " | ".join(str(c) for c in row) + " |\n") + f.write("\n") + + # Add images + images = section.get("images", []) + if images: + assets_dir = os.path.join(self.skill_dir, "assets") + os.makedirs(assets_dir, exist_ok=True) + + f.write("### Images\n\n") + for img in images: + img_index = img.get("index", 0) + img_data = img.get("data", b"") + img_filename = f"section_{sec_num}_img_{img_index}.png" + img_path = os.path.join(assets_dir, img_filename) + + if isinstance(img_data, (bytes, bytearray)): + with open(img_path, "wb") as img_file: + img_file.write(img_data) + f.write(f"![Image {img_index}](../assets/{img_filename})\n\n") + + f.write("---\n\n") + + print(f" Generated: {filename}") + + def _generate_index(self, categorized): + """Generate reference index.""" + 
filename = f"{self.skill_dir}/references/index.md" + + docx_basename = "" + if self.docx_path: + docx_basename = Path(self.docx_path).stem + + total_sections = len(categorized) + + with open(filename, "w", encoding="utf-8") as f: + f.write(f"# {self.name.title()} Documentation Reference\n\n") + f.write("## Categories\n\n") + + section_num = 1 + for _cat_key, cat_data in categorized.items(): + sections = cat_data["pages"] + section_count = len(sections) + + if sections: + section_nums = [s.get("section_number", i + 1) for i, s in enumerate(sections)] + sec_range_str = f"Sections {min(section_nums)}-{max(section_nums)}" + + if total_sections == 1: + link_filename = f"{docx_basename}.md" if docx_basename else "main.md" + else: + sec_range = f"s{min(section_nums)}-s{max(section_nums)}" + base_name = docx_basename if docx_basename else "section" + link_filename = f"{base_name}_{sec_range}.md" + else: + link_filename = f"section_{section_num:02d}.md" + sec_range_str = "N/A" + + f.write( + f"- [{cat_data['title']}]({link_filename}) " + f"({section_count} sections, {sec_range_str})\n" + ) + section_num += 1 + + f.write("\n## Statistics\n\n") + f.write(f"- Total sections: {self.extracted_data.get('total_sections', 0)}\n") + f.write(f"- Code blocks: {self.extracted_data.get('total_code_blocks', 0)}\n") + f.write(f"- Images: {self.extracted_data.get('total_images', 0)}\n") + + # Metadata + metadata = self.extracted_data.get("metadata", {}) + if metadata.get("author"): + f.write(f"- Author: {metadata['author']}\n") + if metadata.get("created"): + f.write(f"- Created: {metadata['created']}\n") + + print(f" Generated: {filename}") + + def _generate_skill_md(self, categorized): + """Generate main SKILL.md file.""" + filename = f"{self.skill_dir}/SKILL.md" + + skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64] + desc = self.description[:1024] if len(self.description) > 1024 else self.description + + with open(filename, "w", encoding="utf-8") as f: + # YAML frontmatter + f.write("---\n") + f.write(f"name: {skill_name}\n") + f.write(f"description: {desc}\n") + f.write("---\n\n") + + f.write(f"# {self.name.title()} Documentation Skill\n\n") + f.write(f"{self.description}\n\n") + + # Document metadata + metadata = self.extracted_data.get("metadata", {}) + if any(metadata.values()): + f.write("## 📋 Document Information\n\n") + if metadata.get("title"): + f.write(f"**Title:** {metadata['title']}\n\n") + if metadata.get("author"): + f.write(f"**Author:** {metadata['author']}\n\n") + if metadata.get("created"): + f.write(f"**Created:** {metadata['created']}\n\n") + if metadata.get("modified"): + f.write(f"**Modified:** {metadata['modified']}\n\n") + + # When to Use + f.write("## 💡 When to Use This Skill\n\n") + f.write("Use this skill when you need to:\n") + f.write(f"- Understand {self.name} concepts and fundamentals\n") + f.write("- Look up API references and technical specifications\n") + f.write("- Find code examples and implementation patterns\n") + f.write("- Review tutorials, guides, and best practices\n") + f.write("- Explore the complete documentation structure\n\n") + + # Section Overview + total_sections = self.extracted_data.get("total_sections", 0) + f.write("## 📖 Section Overview\n\n") + f.write(f"**Total Sections:** {total_sections}\n\n") + f.write("**Content Breakdown:**\n\n") + for _cat_key, cat_data in categorized.items(): + section_count = len(cat_data["pages"]) + f.write(f"- **{cat_data['title']}**: {section_count} sections\n") + f.write("\n") + + # Key Concepts from 
headings + f.write(self._format_key_concepts()) + + # Quick Reference patterns + f.write("## ⚡ Quick Reference\n\n") + f.write(self._format_patterns_from_content()) + + # Code examples (top 15, grouped by language) + all_code = [] + for section in self.extracted_data.get("pages", []): + all_code.extend(section.get("code_samples", [])) + + all_code.sort(key=lambda x: x.get("quality_score", 0), reverse=True) + top_code = all_code[:15] + + if top_code: + f.write("## 📝 Code Examples\n\n") + f.write("*High-quality examples extracted from documentation*\n\n") + + by_lang: dict[str, list] = {} + for code in top_code: + lang = code.get("language", "unknown") + by_lang.setdefault(lang, []).append(code) + + for lang in sorted(by_lang.keys()): + examples = by_lang[lang] + f.write(f"### {lang.title()} Examples ({len(examples)})\n\n") + for i, code in enumerate(examples[:5], 1): + quality = code.get("quality_score", 0) + code_text = code.get("code", "") + f.write(f"**Example {i}** (Quality: {quality:.1f}/10):\n\n") + f.write(f"```{lang}\n") + if len(code_text) <= 500: + f.write(code_text) + else: + f.write(code_text[:500] + "\n...") + f.write("\n```\n\n") + + # Table Summary (first 5 tables) + all_tables = [] + for section in self.extracted_data.get("pages", []): + for table in section.get("tables", []): + all_tables.append((section.get("heading", ""), table)) + + if all_tables: + f.write("## 📊 Table Summary\n\n") + f.write(f"*{len(all_tables)} table(s) found in document*\n\n") + for section_heading, table in all_tables[:5]: + if section_heading: + f.write(f"**From section: {section_heading}**\n\n") + headers = table.get("headers", []) + rows = table.get("rows", []) + if headers: + f.write("| " + " | ".join(str(h) for h in headers) + " |\n") + f.write("| " + " | ".join("---" for _ in headers) + " |\n") + for row in rows[:5]: + f.write("| " + " | ".join(str(c) for c in row) + " |\n") + f.write("\n") + + # Statistics + f.write("## 📊 Documentation Statistics\n\n") + f.write(f"- **Total Sections**: {total_sections}\n") + f.write(f"- **Code Blocks**: {self.extracted_data.get('total_code_blocks', 0)}\n") + f.write(f"- **Images/Diagrams**: {self.extracted_data.get('total_images', 0)}\n") + f.write(f"- **Tables**: {len(all_tables)}\n") + + langs = self.extracted_data.get("languages_detected", {}) + if langs: + f.write(f"- **Programming Languages**: {len(langs)}\n\n") + f.write("**Language Breakdown:**\n\n") + for lang, count in sorted(langs.items(), key=lambda x: x[1], reverse=True): + f.write(f"- {lang}: {count} examples\n") + f.write("\n") + + # Navigation + f.write("## 🗺️ Navigation\n\n") + f.write("**Reference Files:**\n\n") + for _cat_key, cat_data in categorized.items(): + cat_file = self._sanitize_filename(cat_data["title"]) + f.write(f"- `references/{cat_file}.md` - {cat_data['title']}\n") + f.write("\n") + f.write("See `references/index.md` for complete documentation structure.\n\n") + + # Footer + f.write("---\n\n") + f.write("**Generated by Skill Seeker** | Word Document Scraper\n") + + with open(filename, encoding="utf-8") as f: + line_count = len(f.read().split("\n")) + print(f" Generated: {filename} ({line_count} lines)") + + def _format_key_concepts(self) -> str: + """Extract key concepts from headings across all sections.""" + all_headings = [] + for section in self.extracted_data.get("pages", []): + # Main heading + heading = section.get("heading", "").strip() + level = section.get("heading_level", "h1") + if heading and len(heading) > 3: + all_headings.append((level, heading)) + # 
Sub-headings + for sub in section.get("headings", []): + text = sub.get("text", "").strip() + sub_level = sub.get("level", "h3") + if text and len(text) > 3: + all_headings.append((sub_level, text)) + + if not all_headings: + return "" + + content = "## 🔑 Key Concepts\n\n" + content += "*Main topics covered in this documentation*\n\n" + + h1_headings = [text for level, text in all_headings if level == "h1"] + h2_headings = [text for level, text in all_headings if level == "h2"] + + if h1_headings: + content += "**Major Topics:**\n\n" + for heading in h1_headings[:10]: + content += f"- {heading}\n" + content += "\n" + + if h2_headings: + content += "**Subtopics:**\n\n" + for heading in h2_headings[:15]: + content += f"- {heading}\n" + content += "\n" + + return content + + def _format_patterns_from_content(self) -> str: + """Extract common patterns from text content.""" + patterns = [] + pattern_keywords = [ + "getting started", + "installation", + "configuration", + "usage", + "api", + "examples", + "tutorial", + "guide", + "best practices", + "troubleshooting", + "faq", + ] + + for section in self.extracted_data.get("pages", []): + heading_text = section.get("heading", "").lower() + sec_num = section.get("section_number", 0) + + for keyword in pattern_keywords: + if keyword in heading_text: + patterns.append( + { + "type": keyword.title(), + "heading": section.get("heading", ""), + "section": sec_num, + } + ) + break + + if not patterns: + return "*See reference files for detailed content*\n\n" + + content = "*Common documentation patterns found:*\n\n" + by_type: dict[str, list] = {} + for pattern in patterns: + ptype = pattern["type"] + by_type.setdefault(ptype, []).append(pattern) + + for ptype in sorted(by_type.keys()): + items = by_type[ptype] + content += f"**{ptype}** ({len(items)} sections):\n" + for item in items[:3]: + content += f"- {item['heading']} (section {item['section']})\n" + content += "\n" + + return content + + def _sanitize_filename(self, name): + """Convert string to safe filename.""" + safe = re.sub(r"[^\w\s-]", "", name.lower()) + safe = re.sub(r"[-\s]+", "_", safe) + return safe + + +# --------------------------------------------------------------------------- +# HTML-to-sections helper (module-level for clarity) +# --------------------------------------------------------------------------- + +def _build_section( + section_number: int, + heading: str | None, + heading_level: str | None, + elements: list, + doc, +) -> dict: + """Build a section dict from a list of BeautifulSoup elements. + + Args: + section_number: 1-based section index + heading: Heading text (or None for preamble) + heading_level: 'h1', 'h2', etc. 
+ elements: List of BeautifulSoup Tag objects belonging to this section + doc: python-docx Document (used for table cross-reference, not currently used) + + Returns: + Section dict compatible with the intermediate JSON format + """ + text_parts = [] + code_samples = [] + tables = [] + sub_headings = [] + images = [] + + for elem in elements: + if not hasattr(elem, "name") or elem.name is None: + continue + + tag = elem.name + + # Sub-headings (h3, h4, h5, h6) within the section + if tag in ("h3", "h4", "h5", "h6"): + sub_text = elem.get_text(strip=True) + if sub_text: + sub_headings.append({"level": tag, "text": sub_text}) + continue + + # Code blocks + if tag == "pre" or (tag == "code" and elem.find_parent("pre") is None): + code_elem = elem.find("code") if tag == "pre" else elem + if code_elem: + code_text = code_elem.get_text() + else: + code_text = elem.get_text() + + code_text = code_text.strip() + if code_text: + # Try to detect language from class attribute + classes = (code_elem or elem).get("class", []) + lang = "" + for cls in classes: + if cls.startswith("language-") or cls.startswith("lang-"): + lang = cls.split("-", 1)[1] + break + + quality_score = _score_code_quality(code_text) + code_samples.append( + {"code": code_text, "language": lang, "quality_score": quality_score} + ) + continue + + # Tables + if tag == "table": + table_data = _extract_table_from_html(elem) + if table_data: + tables.append(table_data) + continue + + # Images + if tag == "img": + # mammoth embeds images as data URIs; extract if present + src = elem.get("src", "") + if src.startswith("data:"): + import base64 + + try: + header, b64data = src.split(",", 1) + img_bytes = base64.b64decode(b64data) + images.append( + { + "index": len(images), + "data": img_bytes, + "width": int(elem.get("width", 0) or 0), + "height": int(elem.get("height", 0) or 0), + } + ) + except Exception: + pass + continue + + # Detect code in
+        # <p> elements that contain <br> tags (multi-line content).
+        # Mammoth renders monospace/Courier paragraphs as <p> with <br> — not <pre>.
+        if tag == "p" and elem.find("br"):
+            raw_text = elem.get_text(separator="\n").strip()
+            # Exclude bullet-point / prose lists (•, *, -)
+            if raw_text and not re.search(r"^[•\-\*]\s", raw_text, re.MULTILINE):
+                quality_score = _score_code_quality(raw_text)
+                if quality_score >= 5.5:
+                    code_samples.append(
+                        {"code": raw_text, "language": "", "quality_score": quality_score}
+                    )
+                    continue
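+        # Illustrative case for the branch above (assumed mammoth output):
+        #   <p>def hello():<br />    print("hi")<br />    return 1</p>
+        # get_text(separator="\n") flattens this into a three-line block that
+        # scores above the 5.5 threshold, so it is kept as a code sample.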
+
+        # Regular text/paragraph content
+        text = elem.get_text(separator=" ", strip=True)
+        if text:
+            text_parts.append(text)
+
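+    # The dict below is the intermediate "section" record; categorize_content()
+    # and the unified scraper consume these under the "pages" key for
+    # PDF-pipeline compatibility.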
+    return {
+        "section_number": section_number,
+        "heading": heading or "",
+        "heading_level": heading_level or "h1",
+        "text": "\n\n".join(text_parts),
+        "headings": sub_headings,
+        "code_samples": code_samples,
+        "tables": tables,
+        "images": images,
+    }
+
+
+def _extract_table_from_html(table_elem) -> dict | None:
+    """Extract headers and rows from a BeautifulSoup <table> element."""
+    headers = []
+    rows = []
+
+    # Try <thead> first for headers
+    thead = table_elem.find("thead")
+    if thead:
+        header_row = thead.find("tr")
+        if header_row:
+            headers = [th.get_text(strip=True) for th in header_row.find_all(["th", "td"])]
+
+    # Body rows
+    tbody = table_elem.find("tbody") or table_elem
+    for row in tbody.find_all("tr"):
+        cells = [td.get_text(strip=True) for td in row.find_all(["td", "th"])]
+        # Skip the header row we already captured
+        if cells and cells != headers:
+            rows.append(cells)
+
+    # If no explicit thead, use first row as header
+    if not headers and rows:
+        headers = rows.pop(0)
+
+    if not headers and not rows:
+        return None
+
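+    # e.g. (illustrative): a two-row table with no <thead>
+    #   <table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>
+    #   -> headers = ["Name", "Age"], rows = [["Alice", "30"]]
+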
+    return {"headers": headers, "rows": rows}
+
+
+def _score_code_quality(code: str) -> float:
+    """Simple quality heuristic for code blocks (0-10 scale)."""
+    if not code:
+        return 0.0
+
+    score = 5.0
+    lines = code.strip().split("\n")
+    line_count = len(lines)
+
+    # More lines = more substantial
+    if line_count >= 10:
+        score += 2.0
+    elif line_count >= 5:
+        score += 1.0
+
+    # Has function/class definitions
+    if re.search(r"\b(def |class |function |func |fn )", code):
+        score += 1.5
+
+    # Has imports/require
+    if re.search(r"\b(import |from .+ import|require\(|using )|#include", code):
+        score += 0.5
+
+    # Has indentation (common in Python, JS, etc.)
+    if re.search(r"^    ", code, re.MULTILINE):
+        score += 0.5
+
+    # Has assignment, operators, or common code syntax
+    if re.search(r"[=:{}()\[\]]", code):
+        score += 0.3
+
+    # Very short snippets get penalized
+    if len(code) < 30:
+        score -= 2.0
+
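+    # Worked examples (derived from the rules above):
+    #   "x = 1"                   -> 5.0 + 0.3 (syntax) - 2.0 (short) = 3.3
+    #   "def f():\n    return 1"  -> 5.0 + 1.5 (def) + 0.5 (indent) + 0.3 (syntax) - 2.0 (short) = 5.3
+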
+    return min(10.0, max(0.0, score))
+
+
+def main():
+    from .arguments.word import add_word_arguments
+
+    parser = argparse.ArgumentParser(
+        description="Convert Word document (.docx) to Claude skill",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
+    add_word_arguments(parser)
+
+    args = parser.parse_args()
+
+    # Set logging level
+    if getattr(args, "quiet", False):
+        logging.getLogger().setLevel(logging.WARNING)
+    elif getattr(args, "verbose", False):
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # Handle --dry-run
+    if getattr(args, "dry_run", False):
+        source = getattr(args, "docx", None) or getattr(args, "from_json", None) or "(none)"
+        print(f"\n{'=' * 60}")
+        print("DRY RUN: Word Document Extraction")
+        print(f"{'=' * 60}")
+        print(f"Source:         {source}")
+        print(f"Name:           {getattr(args, 'name', None) or '(auto-detect)'}")
+        print(f"Enhance level:  {getattr(args, 'enhance_level', 0)}")
+        print(f"\n✅ Dry run complete")
+        return 0
+
+    # Validate inputs
+    if not (getattr(args, "docx", None) or getattr(args, "from_json", None)):
+        parser.error("Must specify --docx or --from-json")
+
+    # Build from JSON workflow
+    if getattr(args, "from_json", None):
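+        # Rebuilds a skill from the "*_extracted.json" file written by
+        # extract_docx() (keys: "pages", "metadata", "total_sections", ...).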
+        name = Path(args.from_json).stem.replace("_extracted", "")
+        config = {
+            "name": getattr(args, "name", None) or name,
+            "description": getattr(args, "description", None) or f"Use when referencing {name} documentation",
+        }
+        try:
+            converter = WordToSkillConverter(config)
+            converter.load_extracted_data(args.from_json)
+            converter.build_skill()
+        except Exception as e:
+            print(f"\n❌ Error: {e}", file=sys.stderr)
+            sys.exit(1)
+        return 0
+
+    # Direct DOCX mode
+    if not getattr(args, "name", None):
+        # Auto-detect name from filename
+        args.name = Path(args.docx).stem
+
+    config = {
+        "name": args.name,
+        "docx_path": args.docx,
+        # Pass None so extract_docx() can infer from document metadata (subject/title)
+        "description": getattr(args, "description", None),
+    }
+    if getattr(args, "categories", None):
+        config["categories"] = args.categories
+
+    try:
+        converter = WordToSkillConverter(config)
+
+        # Extract
+        if not converter.extract_docx():
+            print("\n❌ Word extraction failed - see error above", file=sys.stderr)
+            sys.exit(1)
+
+        # Build skill
+        converter.build_skill()
+
+        # Enhancement Workflow Integration
+        from skill_seekers.cli.workflow_runner import run_workflows
+
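+        # Apply any --enhance-workflow presets first; run_workflows returns
+        # (ran_any, [preset names]) so the enhancement banner below can cite them.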
+        workflow_executed, workflow_names = run_workflows(args)
+        workflow_name = ", ".join(workflow_names) if workflow_names else None
+
+        # Traditional enhancement (complements workflow system)
+        if getattr(args, "enhance_level", 0) > 0:
+            import os
+
+            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
+            mode = "API" if api_key else "LOCAL"
+
+            print("\n" + "=" * 80)
+            print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
+            print("=" * 80)
+            if workflow_executed:
+                print(f"   Running after workflow: {workflow_name}")
+                print(
+                    "   (Workflow provides specialized analysis, enhancement provides general improvements)"
+                )
+            print("")
+
+            skill_dir = converter.skill_dir
+            if api_key:
+                try:
+                    from skill_seekers.cli.enhance_skill import enhance_skill_md
+
+                    enhance_skill_md(skill_dir, api_key)
+                    print("✅ API enhancement complete!")
+                except ImportError:
+                    print("❌ API enhancement not available. Falling back to LOCAL mode...")
+                    from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                    enhancer = LocalSkillEnhancer(Path(skill_dir))
+                    enhancer.run(headless=True)
+                    print("✅ Local enhancement complete!")
+            else:
+                from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                enhancer = LocalSkillEnhancer(Path(skill_dir))
+                enhancer.run(headless=True)
+                print("✅ Local enhancement complete!")
+
+    except RuntimeError as e:
+        print(f"\n❌ Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"\n❌ Unexpected error during Word processing: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/test_cli_parsers.py b/tests/test_cli_parsers.py
index 8266010..57fafef 100644
--- a/tests/test_cli_parsers.py
+++ b/tests/test_cli_parsers.py
@@ -24,12 +24,12 @@ class TestParserRegistry:
 
     def test_all_parsers_registered(self):
         """Test that all parsers are registered."""
-        assert len(PARSERS) == 21, f"Expected 21 parsers, got {len(PARSERS)}"
+        assert len(PARSERS) == 22, f"Expected 22 parsers, got {len(PARSERS)}"
 
     def test_get_parser_names(self):
         """Test getting list of parser names."""
         names = get_parser_names()
-        assert len(names) == 21
+        assert len(names) == 22
         assert "scrape" in names
         assert "github" in names
         assert "package" in names
@@ -242,9 +242,9 @@ class TestBackwardCompatibility:
             assert cmd in names, f"Command '{cmd}' not found in parser registry!"
 
     def test_command_count_matches(self):
-        """Test that we have exactly 21 commands (includes new create and workflows commands)."""
-        assert len(PARSERS) == 21
-        assert len(get_parser_names()) == 21
+        """Test that we have exactly 22 commands (includes new create, workflows, and word commands)."""
+        assert len(PARSERS) == 22
+        assert len(get_parser_names()) == 22
 
 
 if __name__ == "__main__":
diff --git a/tests/test_word_scraper.py b/tests/test_word_scraper.py
new file mode 100644
index 0000000..72dc8c3
--- /dev/null
+++ b/tests/test_word_scraper.py
@@ -0,0 +1,677 @@
+#!/usr/bin/env python3
+"""
+Tests for Word Document Scraper (cli/word_scraper.py)
+
+Tests cover:
+- Config-based initialization
+- Direct DOCX path conversion
+- JSON-based workflow
+- Skill structure generation
+- Categorization
+- Code blocks handling
+- Tables handling
+- Image handling
+- Error handling
+- CLI argument parsing
+"""
+
+import json
+import shutil
+import tempfile
+import unittest
+from pathlib import Path
+
+try:
+    import mammoth  # noqa: F401
+    import docx as python_docx  # noqa: F401
+
+    WORD_AVAILABLE = True
+except ImportError:
+    WORD_AVAILABLE = False
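+# mammoth and python-docx are optional dependencies; when absent, each test class's
+# setUp() calls skipTest() so the suite still passes on minimal installs.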
+
+
+def _make_sample_extracted_data(num_sections=2, include_code=False, include_tables=False,
+                                include_images=False):
+    """Helper to build a minimal extracted_data dict for testing."""
+    mock_image_bytes = (
+        b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01"
+        b"\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\x00\x01"
+        b"\x00\x00\x05\x00\x01\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82"
+    )
+
+    pages = []
+    for i in range(1, num_sections + 1):
+        section = {
+            "section_number": i,
+            "heading": f"Section {i}",
+            "heading_level": "h1",
+            "text": f"Content for section {i}.",
+            "headings": [],
+            "code_samples": [],
+            "tables": [],
+            "images": [],
+        }
+        if include_code:
+            section["code_samples"] = [
+                {"code": f"def hello_{i}():\n    return 'world'", "language": "python",
+                 "quality_score": 7.5}
+            ]
+        if include_tables:
+            section["tables"] = [
+                {"headers": ["Col A", "Col B"], "rows": [["val1", "val2"], ["val3", "val4"]]}
+            ]
+        if include_images:
+            section["images"] = [
+                {"index": 0, "data": mock_image_bytes, "width": 100, "height": 80}
+            ]
+        pages.append(section)
+
+    return {
+        "source_file": "test.docx",
+        "metadata": {"title": "Test Doc", "author": "Test Author", "created": "", "modified": "",
+                     "subject": ""},
+        "total_sections": num_sections,
+        "total_code_blocks": num_sections if include_code else 0,
+        "total_images": num_sections if include_images else 0,
+        "languages_detected": {"python": num_sections} if include_code else {},
+        "pages": pages,
+    }
+
+
+class TestWordToSkillConverterInit(unittest.TestCase):
+    """Test WordToSkillConverter initialization and basic functionality."""
+
+    def setUp(self):
+        if not WORD_AVAILABLE:
+            self.skipTest("mammoth and python-docx not installed")
+        from skill_seekers.cli.word_scraper import WordToSkillConverter
+        self.WordToSkillConverter = WordToSkillConverter
+        self.temp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        if hasattr(self, "temp_dir"):
+            shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def test_init_with_name_and_docx_path(self):
+        """Test initialization with name and docx path."""
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        self.assertEqual(converter.name, "test_skill")
+        self.assertEqual(converter.docx_path, "test.docx")
+
+    def test_init_with_full_config(self):
+        """Test initialization with full config."""
+        config = {
+            "name": "my_skill",
+            "docx_path": "docs/api.docx",
+            "description": "API documentation skill",
+        }
+        converter = self.WordToSkillConverter(config)
+        self.assertEqual(converter.name, "my_skill")
+        self.assertEqual(converter.description, "API documentation skill")
+
+    def test_init_requires_name(self):
+        """Test that missing 'name' field raises an error."""
+        with self.assertRaises((KeyError, TypeError)):
+            self.WordToSkillConverter({})
+
+    def test_default_description_uses_name(self):
+        """Test that default description is generated from name."""
+        config = {"name": "my_api", "docx_path": "api.docx"}
+        converter = self.WordToSkillConverter(config)
+        self.assertIn("my_api", converter.description)
+
+    def test_skill_dir_uses_name(self):
+        """Test that skill_dir is derived from name."""
+        config = {"name": "my_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        self.assertIn("my_skill", converter.skill_dir)
+
+    def test_name_auto_detected_from_filename(self):
+        """Test name can be extracted from filename via infer_description_from_word."""
+        from skill_seekers.cli.word_scraper import infer_description_from_word
+        desc = infer_description_from_word({}, name="my_doc")
+        self.assertIn("my_doc", desc)
+
+
+class TestWordCategorization(unittest.TestCase):
+    """Test content categorization functionality."""
+
+    def setUp(self):
+        if not WORD_AVAILABLE:
+            self.skipTest("mammoth and python-docx not installed")
+        from skill_seekers.cli.word_scraper import WordToSkillConverter
+        self.WordToSkillConverter = WordToSkillConverter
+        self.temp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def test_single_docx_creates_single_category(self):
+        """With docx_path set, categorize_content creates a single category."""
+        config = {"name": "test", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.extracted_data = _make_sample_extracted_data(num_sections=3)
+
+        categories = converter.categorize_content()
+
+        self.assertEqual(len(categories), 1)
+        # Category key is sanitized docx basename
+        self.assertIn("test", categories)
+        self.assertEqual(len(categories["test"]["pages"]), 3)
+
+    def test_keyword_based_categorization(self):
+        """Test keyword-based categorization without docx_path."""
+        config = {
+            "name": "test",
+            "docx_path": "",
+            "categories": {
+                "api": ["api", "reference"],
+                "guide": ["getting started", "tutorial"],
+            },
+        }
+        converter = self.WordToSkillConverter(config)
+        converter.docx_path = ""
+        converter.extracted_data = {
+            "pages": [
+                {"section_number": 1, "heading": "API Reference", "text": "api reference docs",
+                 "code_samples": [], "tables": [], "images": []},
+                {"section_number": 2, "heading": "Getting Started", "text": "getting started guide",
+                 "code_samples": [], "tables": [], "images": []},
+            ]
+        }
+
+        categories = converter.categorize_content()
+        self.assertIsInstance(categories, dict)
+        self.assertGreater(len(categories), 0)
+
+    def test_fallback_to_content_category(self):
+        """Without docx_path and no categories config, uses 'content' category."""
+        config = {"name": "test", "docx_path": ""}
+        converter = self.WordToSkillConverter(config)
+        converter.docx_path = ""
+        converter.extracted_data = _make_sample_extracted_data(num_sections=1)
+
+        categories = converter.categorize_content()
+        self.assertIsInstance(categories, dict)
+        self.assertGreater(len(categories), 0)
+
+
+class TestWordSkillBuilding(unittest.TestCase):
+    """Test skill structure generation."""
+
+    def setUp(self):
+        if not WORD_AVAILABLE:
+            self.skipTest("mammoth and python-docx not installed")
+        from skill_seekers.cli.word_scraper import WordToSkillConverter
+        self.WordToSkillConverter = WordToSkillConverter
+        self.temp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def test_build_skill_creates_directory_structure(self):
+        """build_skill creates required directory structure."""
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
+        converter.extracted_data = _make_sample_extracted_data()
+
+        converter.build_skill()
+
+        skill_dir = Path(self.temp_dir) / "test_skill"
+        self.assertTrue(skill_dir.exists())
+        self.assertTrue((skill_dir / "references").exists())
+        self.assertTrue((skill_dir / "scripts").exists())
+        self.assertTrue((skill_dir / "assets").exists())
+
+    def test_build_skill_creates_skill_md(self):
+        """build_skill creates SKILL.md with correct content."""
+        config = {
+            "name": "test_skill",
+            "docx_path": "test.docx",
+            "description": "Test description for docs",
+        }
+        converter = self.WordToSkillConverter(config)
+        converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
+        converter.extracted_data = _make_sample_extracted_data()
+
+        converter.build_skill()
+
+        skill_md = Path(self.temp_dir) / "test_skill" / "SKILL.md"
+        self.assertTrue(skill_md.exists())
+
+        content = skill_md.read_text()
+        self.assertIn("test_skill", content)
+        self.assertIn("Test description for docs", content)
+
+    def test_build_skill_creates_reference_files(self):
+        """build_skill creates reference markdown files."""
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
+        converter.extracted_data = _make_sample_extracted_data(num_sections=2)
+
+        converter.build_skill()
+
+        refs_dir = Path(self.temp_dir) / "test_skill" / "references"
+        # Single-source: named after docx basename
+        self.assertTrue((refs_dir / "test.md").exists())
+        self.assertTrue((refs_dir / "index.md").exists())
+
+    def test_skill_md_has_yaml_frontmatter(self):
+        """SKILL.md starts with valid YAML frontmatter."""
+        config = {"name": "myskill", "docx_path": "doc.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.skill_dir = str(Path(self.temp_dir) / "myskill")
+        converter.extracted_data = _make_sample_extracted_data()
+
+        converter.build_skill()
+
+        skill_md = Path(self.temp_dir) / "myskill" / "SKILL.md"
+        content = skill_md.read_text()
+        self.assertTrue(content.startswith("---\n"))
+        self.assertIn("name:", content)
+        self.assertIn("description:", content)
+
+    def test_skill_md_includes_section_overview(self):
+        """SKILL.md includes a Section Overview."""
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
+        converter.extracted_data = _make_sample_extracted_data(num_sections=3)
+
+        converter.build_skill()
+
+        skill_md = Path(self.temp_dir) / "test_skill" / "SKILL.md"
+        content = skill_md.read_text()
+        self.assertIn("Section Overview", content)
+        self.assertIn("Total Sections", content)
+
+
+class TestWordCodeBlocks(unittest.TestCase):
+    """Test code block extraction and inclusion."""
+
+    def setUp(self):
+        if not WORD_AVAILABLE:
+            self.skipTest("mammoth and python-docx not installed")
+        from skill_seekers.cli.word_scraper import WordToSkillConverter
+        self.WordToSkillConverter = WordToSkillConverter
+        self.temp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def test_code_blocks_included_in_references(self):
+        """Code blocks are included in reference files."""
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
+        converter.extracted_data = _make_sample_extracted_data(include_code=True)
+
+        converter.build_skill()
+
+        ref_file = Path(self.temp_dir) / "test_skill" / "references" / "test.md"
+        content = ref_file.read_text()
+        self.assertIn("```python", content)
+        self.assertIn("def hello_", content)
+
+    def test_code_examples_in_skill_md(self):
+        """SKILL.md includes code examples section when code is present."""
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
+        converter.extracted_data = _make_sample_extracted_data(include_code=True)
+
+        converter.build_skill()
+
+        skill_md = Path(self.temp_dir) / "test_skill" / "SKILL.md"
+        content = skill_md.read_text()
+        self.assertIn("Code Examples", content)
+
+    def test_language_detected_in_statistics(self):
+        """Language statistics are included in SKILL.md."""
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
+        converter.extracted_data = _make_sample_extracted_data(include_code=True)
+
+        converter.build_skill()
+
+        skill_md = Path(self.temp_dir) / "test_skill" / "SKILL.md"
+        content = skill_md.read_text()
+        self.assertIn("python", content)
+
+
+class TestWordTables(unittest.TestCase):
+    """Test table extraction and rendering."""
+
+    def setUp(self):
+        if not WORD_AVAILABLE:
+            self.skipTest("mammoth and python-docx not installed")
+        from skill_seekers.cli.word_scraper import WordToSkillConverter
+        self.WordToSkillConverter = WordToSkillConverter
+        self.temp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def test_tables_rendered_in_references(self):
+        """Tables are rendered as markdown tables in reference files."""
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
+        converter.extracted_data = _make_sample_extracted_data(include_tables=True)
+
+        converter.build_skill()
+
+        ref_file = Path(self.temp_dir) / "test_skill" / "references" / "test.md"
+        content = ref_file.read_text()
+        # Markdown table syntax
+        self.assertIn("| Col A |", content)
+        self.assertIn("| --- |", content)
+
+    def test_table_summary_in_skill_md(self):
+        """Table summary section appears in SKILL.md when tables exist."""
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
+        converter.extracted_data = _make_sample_extracted_data(include_tables=True)
+
+        converter.build_skill()
+
+        skill_md = Path(self.temp_dir) / "test_skill" / "SKILL.md"
+        content = skill_md.read_text()
+        self.assertIn("Table Summary", content)
+
+
+class TestWordImages(unittest.TestCase):
+    """Test image extraction and handling."""
+
+    def setUp(self):
+        if not WORD_AVAILABLE:
+            self.skipTest("mammoth and python-docx not installed")
+        from skill_seekers.cli.word_scraper import WordToSkillConverter
+        self.WordToSkillConverter = WordToSkillConverter
+        self.temp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def test_images_saved_to_assets(self):
+        """Images are saved to the assets/ directory."""
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
+        converter.extracted_data = _make_sample_extracted_data(include_images=True)
+
+        converter.build_skill()
+
+        assets_dir = Path(self.temp_dir) / "test_skill" / "assets"
+        png_files = list(assets_dir.glob("*.png"))
+        self.assertGreater(len(png_files), 0)
+
+    def test_image_references_in_markdown(self):
+        """Images are referenced with markdown syntax in reference files."""
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
+        converter.extracted_data = _make_sample_extracted_data(include_images=True)
+
+        converter.build_skill()
+
+        ref_file = Path(self.temp_dir) / "test_skill" / "references" / "test.md"
+        content = ref_file.read_text()
+        self.assertIn("![", content)
+        self.assertIn("../assets/", content)
+
+
+class TestWordErrorHandling(unittest.TestCase):
+    """Test error handling for invalid inputs."""
+
+    def setUp(self):
+        if not WORD_AVAILABLE:
+            self.skipTest("mammoth and python-docx not installed")
+        from skill_seekers.cli.word_scraper import WordToSkillConverter
+        self.WordToSkillConverter = WordToSkillConverter
+        self.temp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def test_missing_docx_file_raises_error(self):
+        """extract_docx raises FileNotFoundError for missing file."""
+        config = {"name": "test", "docx_path": "/nonexistent/path/test.docx"}
+        converter = self.WordToSkillConverter(config)
+        with self.assertRaises((FileNotFoundError, RuntimeError)):
+            converter.extract_docx()
+
+    def test_invalid_config_raises_error(self):
+        """Non-dict config raises TypeError or AttributeError."""
+        with self.assertRaises((TypeError, AttributeError)):
+            self.WordToSkillConverter("invalid string")
+
+    def test_missing_name_raises_key_error(self):
+        """Config without 'name' raises KeyError."""
+        with self.assertRaises((KeyError, TypeError)):
+            self.WordToSkillConverter({"docx_path": "test.docx"})
+
+
+class TestWordJSONWorkflow(unittest.TestCase):
+    """Test building skills from extracted JSON."""
+
+    def setUp(self):
+        if not WORD_AVAILABLE:
+            self.skipTest("mammoth and python-docx not installed")
+        from skill_seekers.cli.word_scraper import WordToSkillConverter
+        self.WordToSkillConverter = WordToSkillConverter
+        self.temp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def test_load_from_json(self):
+        """load_extracted_data loads the JSON correctly."""
+        extracted_data = _make_sample_extracted_data(num_sections=3)
+        json_path = Path(self.temp_dir) / "extracted.json"
+        json_path.write_text(json.dumps(extracted_data, indent=2))
+
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.load_extracted_data(str(json_path))
+
+        self.assertEqual(converter.extracted_data["total_sections"], 3)
+        self.assertEqual(len(converter.extracted_data["pages"]), 3)
+
+    def test_build_from_json_without_extraction(self):
+        """JSON workflow skips extract_docx() and goes directly to build."""
+        extracted_data = _make_sample_extracted_data(num_sections=2)
+        json_path = Path(self.temp_dir) / "extracted.json"
+        json_path.write_text(json.dumps(extracted_data))
+
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.load_extracted_data(str(json_path))
+
+        self.assertIsNotNone(converter.extracted_data)
+        self.assertEqual(len(converter.extracted_data["pages"]), 2)
+
+    def test_skill_built_from_json_has_skill_md(self):
+        """build_skill() works after load_extracted_data()."""
+        extracted_data = _make_sample_extracted_data(num_sections=2)
+        json_path = Path(self.temp_dir) / "extracted.json"
+        json_path.write_text(json.dumps(extracted_data))
+
+        config = {"name": "test_skill", "docx_path": "test.docx"}
+        converter = self.WordToSkillConverter(config)
+        converter.skill_dir = str(Path(self.temp_dir) / "test_skill")
+        converter.load_extracted_data(str(json_path))
+        converter.build_skill()
+
+        skill_md = Path(self.temp_dir) / "test_skill" / "SKILL.md"
+        self.assertTrue(skill_md.exists())
+
+
+class TestWordCLIArguments(unittest.TestCase):
+    """Test word subcommand CLI argument parsing via the main CLI."""
+
+    def setUp(self):
+        import sys
+        from pathlib import Path as P
+
+        sys.path.insert(0, str(P(__file__).parent.parent / "src"))
+        from skill_seekers.cli.main import create_parser
+
+        self.parser = create_parser()
+
+    def test_docx_argument_accepted(self):
+        """--docx flag is accepted for the word subcommand."""
+        args = self.parser.parse_args(["word", "--docx", "test.docx"])
+        self.assertEqual(args.docx, "test.docx")
+
+    def test_api_key_accepted(self):
+        """--api-key is accepted for word subcommand."""
+        args = self.parser.parse_args(["word", "--docx", "test.docx", "--api-key", "sk-ant-test"])
+        self.assertEqual(args.api_key, "sk-ant-test")
+
+    def test_enhance_level_accepted(self):
+        """--enhance-level is accepted for word subcommand."""
+        args = self.parser.parse_args(["word", "--docx", "test.docx", "--enhance-level", "1"])
+        self.assertEqual(args.enhance_level, 1)
+
+    def test_enhance_workflow_accepted(self):
+        """--enhance-workflow is accepted and stores a list."""
+        args = self.parser.parse_args(
+            ["word", "--docx", "test.docx", "--enhance-workflow", "minimal"]
+        )
+        self.assertEqual(args.enhance_workflow, ["minimal"])
+
+    def test_workflow_dry_run_accepted(self):
+        """--workflow-dry-run is accepted."""
+        args = self.parser.parse_args(["word", "--docx", "test.docx", "--workflow-dry-run"])
+        self.assertTrue(args.workflow_dry_run)
+
+    def test_dry_run_accepted(self):
+        """--dry-run is accepted for word subcommand."""
+        args = self.parser.parse_args(["word", "--docx", "test.docx", "--dry-run"])
+        self.assertTrue(args.dry_run)
+
+    def test_from_json_accepted(self):
+        """--from-json is accepted."""
+        args = self.parser.parse_args(["word", "--from-json", "data.json"])
+        self.assertEqual(args.from_json, "data.json")
+
+    def test_name_accepted(self):
+        """--name is accepted."""
+        args = self.parser.parse_args(["word", "--docx", "test.docx", "--name", "myskill"])
+        self.assertEqual(args.name, "myskill")
+
+
+class TestWordHelperFunctions(unittest.TestCase):
+    """Test module-level helper functions."""
+
+    def setUp(self):
+        if not WORD_AVAILABLE:
+            self.skipTest("mammoth and python-docx not installed")
+
+    def test_build_section_basic(self):
+        """_build_section returns a well-formed dict."""
+        from skill_seekers.cli.word_scraper import _build_section
+        from bs4 import BeautifulSoup
+
+        html = "

Hello world.

Second paragraph.

" + soup = BeautifulSoup(html, "html.parser") + elements = list(soup.children) + + section = _build_section(1, "Intro", "h1", elements, None) + + self.assertEqual(section["section_number"], 1) + self.assertEqual(section["heading"], "Intro") + self.assertEqual(section["heading_level"], "h1") + self.assertIn("Hello world", section["text"]) + + def test_extract_table_from_html(self): + """_extract_table_from_html extracts headers and rows.""" + from skill_seekers.cli.word_scraper import _extract_table_from_html + from bs4 import BeautifulSoup + + html = """ +
+ + + + + +
NameValue
foo1
bar2
""" + soup = BeautifulSoup(html, "html.parser") + table_elem = soup.find("table") + + result = _extract_table_from_html(table_elem) + + self.assertIsNotNone(result) + self.assertEqual(result["headers"], ["Name", "Value"]) + self.assertEqual(len(result["rows"]), 2) + self.assertIn(["foo", "1"], result["rows"]) + + def test_score_code_quality_basic(self): + """_score_code_quality returns a score in [0, 10].""" + from skill_seekers.cli.word_scraper import _score_code_quality + + score = _score_code_quality("def foo():\n return 'bar'\n") + self.assertGreaterEqual(score, 0.0) + self.assertLessEqual(score, 10.0) + + def test_score_code_quality_empty(self): + """_score_code_quality returns 0.0 for empty code.""" + from skill_seekers.cli.word_scraper import _score_code_quality + + self.assertEqual(_score_code_quality(""), 0.0) + + def test_infer_description_from_word_subject(self): + """infer_description_from_word uses subject field when available.""" + from skill_seekers.cli.word_scraper import infer_description_from_word + + metadata = {"title": "Some Doc", "subject": "Writing API documentation for REST services"} + desc = infer_description_from_word(metadata, "api_docs") + self.assertIn("writing api documentation", desc.lower()) + + def test_infer_description_from_word_fallback(self): + """infer_description_from_word falls back to name.""" + from skill_seekers.cli.word_scraper import infer_description_from_word + + desc = infer_description_from_word({}, name="myskill") + self.assertIn("myskill", desc) + + +class TestWordSourceDetection(unittest.TestCase): + """Test .docx source detection in SourceDetector.""" + + def test_docx_detected_as_word_type(self): + """SourceDetector.detect() returns type='word' for .docx files.""" + from skill_seekers.cli.source_detector import SourceDetector + + # Use a path that ends in .docx (doesn't need to exist for detection) + source_info = SourceDetector.detect("/tmp/test_document.docx") + self.assertEqual(source_info.type, "word") + self.assertEqual(source_info.parsed["file_path"], "/tmp/test_document.docx") + self.assertEqual(source_info.suggested_name, "test_document") + + def test_docx_validation_missing_file(self): + """validate_source raises ValueError for missing .docx file.""" + from skill_seekers.cli.source_detector import SourceDetector + + source_info = SourceDetector.detect("/tmp/nonexistent_12345.docx") + with self.assertRaises(ValueError) as ctx: + SourceDetector.validate_source(source_info) + self.assertIn("does not exist", str(ctx.exception)) + + def test_pdf_still_detected(self): + """Existing PDF detection is unaffected by Word support.""" + from skill_seekers.cli.source_detector import SourceDetector + + source_info = SourceDetector.detect("/tmp/test.pdf") + self.assertEqual(source_info.type, "pdf") + + +if __name__ == "__main__": + unittest.main() diff --git a/uv.lock b/uv.lock index ca708f7..6d7bf71 100644 --- a/uv.lock +++ b/uv.lock @@ -684,6 +684,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, ] +[[package]] +name = "cobble" +version = "0.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/7a/a507c709be2c96e1bb6102eb7b7f4026c5e5e223ef7d745a17d239e9d844/cobble-0.1.4.tar.gz", hash = 
"sha256:de38be1539992c8a06e569630717c485a5f91be2192c461ea2b220607dfa78aa", size = 3805, upload-time = "2024-06-01T18:11:09.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/e1/3714a2f371985215c219c2a70953d38e3eed81ef165aed061d21de0e998b/cobble-0.1.4-py3-none-any.whl", hash = "sha256:36c91b1655e599fd428e2b95fdd5f0da1ca2e9f1abb0bc871dec21a0e78a2b44", size = 3984, upload-time = "2024-06-01T18:11:07.911Z" }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -2391,6 +2400,142 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/05/50/c5ccd2a50daa0a10c7f3f7d4e6992392454198cd8a7d99fcb96cb60d0686/llama_parse-0.6.54-py3-none-any.whl", hash = "sha256:c66c8d51cf6f29a44eaa8595a595de5d2598afc86e5a33a4cebe5fe228036920", size = 4879, upload-time = "2025-08-01T20:09:22.651Z" }, ] +[[package]] +name = "lxml" +version = "6.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/88/262177de60548e5a2bfc46ad28232c9e9cbde697bd94132aeb80364675cb/lxml-6.0.2.tar.gz", hash = "sha256:cd79f3367bd74b317dda655dc8fcfa304d9eb6e4fb06b7168c5cf27f96e0cd62", size = 4073426, upload-time = "2025-09-22T04:04:59.287Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/8a/f8192a08237ef2fb1b19733f709db88a4c43bc8ab8357f01cb41a27e7f6a/lxml-6.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e77dd455b9a16bbd2a5036a63ddbd479c19572af81b624e79ef422f929eef388", size = 8590589, upload-time = "2025-09-22T04:00:10.51Z" }, + { url = "https://files.pythonhosted.org/packages/12/64/27bcd07ae17ff5e5536e8d88f4c7d581b48963817a13de11f3ac3329bfa2/lxml-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d444858b9f07cefff6455b983aea9a67f7462ba1f6cbe4a21e8bf6791bf2153", size = 4629671, upload-time = "2025-09-22T04:00:15.411Z" }, + { url = "https://files.pythonhosted.org/packages/02/5a/a7d53b3291c324e0b6e48f3c797be63836cc52156ddf8f33cd72aac78866/lxml-6.0.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f952dacaa552f3bb8834908dddd500ba7d508e6ea6eb8c52eb2d28f48ca06a31", size = 4999961, upload-time = "2025-09-22T04:00:17.619Z" }, + { url = "https://files.pythonhosted.org/packages/f5/55/d465e9b89df1761674d8672bb3e4ae2c47033b01ec243964b6e334c6743f/lxml-6.0.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:71695772df6acea9f3c0e59e44ba8ac50c4f125217e84aab21074a1a55e7e5c9", size = 5157087, upload-time = "2025-09-22T04:00:19.868Z" }, + { url = "https://files.pythonhosted.org/packages/62/38/3073cd7e3e8dfc3ba3c3a139e33bee3a82de2bfb0925714351ad3d255c13/lxml-6.0.2-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:17f68764f35fd78d7c4cc4ef209a184c38b65440378013d24b8aecd327c3e0c8", size = 5067620, upload-time = "2025-09-22T04:00:21.877Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d3/1e001588c5e2205637b08985597827d3827dbaaece16348c8822bfe61c29/lxml-6.0.2-cp310-cp310-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:058027e261afed589eddcfe530fcc6f3402d7fd7e89bfd0532df82ebc1563dba", size = 5406664, upload-time = "2025-09-22T04:00:23.714Z" }, + { url = "https://files.pythonhosted.org/packages/20/cf/cab09478699b003857ed6ebfe95e9fb9fa3d3c25f1353b905c9b73cfb624/lxml-6.0.2-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8ffaeec5dfea5881d4c9d8913a32d10cfe3923495386106e4a24d45300ef79c", size = 5289397, upload-time = "2025-09-22T04:00:25.544Z" }, + { url = 
"https://files.pythonhosted.org/packages/a3/84/02a2d0c38ac9a8b9f9e5e1bbd3f24b3f426044ad618b552e9549ee91bd63/lxml-6.0.2-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:f2e3b1a6bb38de0bc713edd4d612969dd250ca8b724be8d460001a387507021c", size = 4772178, upload-time = "2025-09-22T04:00:27.602Z" }, + { url = "https://files.pythonhosted.org/packages/56/87/e1ceadcc031ec4aa605fe95476892d0b0ba3b7f8c7dcdf88fdeff59a9c86/lxml-6.0.2-cp310-cp310-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d6690ec5ec1cce0385cb20896b16be35247ac8c2046e493d03232f1c2414d321", size = 5358148, upload-time = "2025-09-22T04:00:29.323Z" }, + { url = "https://files.pythonhosted.org/packages/fe/13/5bb6cf42bb228353fd4ac5f162c6a84fd68a4d6f67c1031c8cf97e131fc6/lxml-6.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2a50c3c1d11cad0ebebbac357a97b26aa79d2bcaf46f256551152aa85d3a4d1", size = 5112035, upload-time = "2025-09-22T04:00:31.061Z" }, + { url = "https://files.pythonhosted.org/packages/e4/e2/ea0498552102e59834e297c5c6dff8d8ded3db72ed5e8aad77871476f073/lxml-6.0.2-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:3efe1b21c7801ffa29a1112fab3b0f643628c30472d507f39544fd48e9549e34", size = 4799111, upload-time = "2025-09-22T04:00:33.11Z" }, + { url = "https://files.pythonhosted.org/packages/6a/9e/8de42b52a73abb8af86c66c969b3b4c2a96567b6ac74637c037d2e3baa60/lxml-6.0.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:59c45e125140b2c4b33920d21d83681940ca29f0b83f8629ea1a2196dc8cfe6a", size = 5351662, upload-time = "2025-09-22T04:00:35.237Z" }, + { url = "https://files.pythonhosted.org/packages/28/a2/de776a573dfb15114509a37351937c367530865edb10a90189d0b4b9b70a/lxml-6.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:452b899faa64f1805943ec1c0c9ebeaece01a1af83e130b69cdefeda180bb42c", size = 5314973, upload-time = "2025-09-22T04:00:37.086Z" }, + { url = "https://files.pythonhosted.org/packages/50/a0/3ae1b1f8964c271b5eec91db2043cf8c6c0bce101ebb2a633b51b044db6c/lxml-6.0.2-cp310-cp310-win32.whl", hash = "sha256:1e786a464c191ca43b133906c6903a7e4d56bef376b75d97ccbb8ec5cf1f0a4b", size = 3611953, upload-time = "2025-09-22T04:00:39.224Z" }, + { url = "https://files.pythonhosted.org/packages/d1/70/bd42491f0634aad41bdfc1e46f5cff98825fb6185688dc82baa35d509f1a/lxml-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:dacf3c64ef3f7440e3167aa4b49aa9e0fb99e0aa4f9ff03795640bf94531bcb0", size = 4032695, upload-time = "2025-09-22T04:00:41.402Z" }, + { url = "https://files.pythonhosted.org/packages/d2/d0/05c6a72299f54c2c561a6c6cbb2f512e047fca20ea97a05e57931f194ac4/lxml-6.0.2-cp310-cp310-win_arm64.whl", hash = "sha256:45f93e6f75123f88d7f0cfd90f2d05f441b808562bf0bc01070a00f53f5028b5", size = 3680051, upload-time = "2025-09-22T04:00:43.525Z" }, + { url = "https://files.pythonhosted.org/packages/77/d5/becbe1e2569b474a23f0c672ead8a29ac50b2dc1d5b9de184831bda8d14c/lxml-6.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:13e35cbc684aadf05d8711a5d1b5857c92e5e580efa9a0d2be197199c8def607", size = 8634365, upload-time = "2025-09-22T04:00:45.672Z" }, + { url = "https://files.pythonhosted.org/packages/28/66/1ced58f12e804644426b85d0bb8a4478ca77bc1761455da310505f1a3526/lxml-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b1675e096e17c6fe9c0e8c81434f5736c0739ff9ac6123c87c2d452f48fc938", size = 4650793, upload-time = "2025-09-22T04:00:47.783Z" }, + { url = 
"https://files.pythonhosted.org/packages/11/84/549098ffea39dfd167e3f174b4ce983d0eed61f9d8d25b7bf2a57c3247fc/lxml-6.0.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8ac6e5811ae2870953390452e3476694196f98d447573234592d30488147404d", size = 4944362, upload-time = "2025-09-22T04:00:49.845Z" }, + { url = "https://files.pythonhosted.org/packages/ac/bd/f207f16abf9749d2037453d56b643a7471d8fde855a231a12d1e095c4f01/lxml-6.0.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5aa0fc67ae19d7a64c3fe725dc9a1bb11f80e01f78289d05c6f62545affec438", size = 5083152, upload-time = "2025-09-22T04:00:51.709Z" }, + { url = "https://files.pythonhosted.org/packages/15/ae/bd813e87d8941d52ad5b65071b1affb48da01c4ed3c9c99e40abb266fbff/lxml-6.0.2-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de496365750cc472b4e7902a485d3f152ecf57bd3ba03ddd5578ed8ceb4c5964", size = 5023539, upload-time = "2025-09-22T04:00:53.593Z" }, + { url = "https://files.pythonhosted.org/packages/02/cd/9bfef16bd1d874fbe0cb51afb00329540f30a3283beb9f0780adbb7eec03/lxml-6.0.2-cp311-cp311-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:200069a593c5e40b8f6fc0d84d86d970ba43138c3e68619ffa234bc9bb806a4d", size = 5344853, upload-time = "2025-09-22T04:00:55.524Z" }, + { url = "https://files.pythonhosted.org/packages/b8/89/ea8f91594bc5dbb879734d35a6f2b0ad50605d7fb419de2b63d4211765cc/lxml-6.0.2-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7d2de809c2ee3b888b59f995625385f74629707c9355e0ff856445cdcae682b7", size = 5225133, upload-time = "2025-09-22T04:00:57.269Z" }, + { url = "https://files.pythonhosted.org/packages/b9/37/9c735274f5dbec726b2db99b98a43950395ba3d4a1043083dba2ad814170/lxml-6.0.2-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:b2c3da8d93cf5db60e8858c17684c47d01fee6405e554fb55018dd85fc23b178", size = 4677944, upload-time = "2025-09-22T04:00:59.052Z" }, + { url = "https://files.pythonhosted.org/packages/20/28/7dfe1ba3475d8bfca3878365075abe002e05d40dfaaeb7ec01b4c587d533/lxml-6.0.2-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:442de7530296ef5e188373a1ea5789a46ce90c4847e597856570439621d9c553", size = 5284535, upload-time = "2025-09-22T04:01:01.335Z" }, + { url = "https://files.pythonhosted.org/packages/e7/cf/5f14bc0de763498fc29510e3532bf2b4b3a1c1d5d0dff2e900c16ba021ef/lxml-6.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2593c77efde7bfea7f6389f1ab249b15ed4aa5bc5cb5131faa3b843c429fbedb", size = 5067343, upload-time = "2025-09-22T04:01:03.13Z" }, + { url = "https://files.pythonhosted.org/packages/1c/b0/bb8275ab5472f32b28cfbbcc6db7c9d092482d3439ca279d8d6fa02f7025/lxml-6.0.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:3e3cb08855967a20f553ff32d147e14329b3ae70ced6edc2f282b94afbc74b2a", size = 4725419, upload-time = "2025-09-22T04:01:05.013Z" }, + { url = "https://files.pythonhosted.org/packages/25/4c/7c222753bc72edca3b99dbadba1b064209bc8ed4ad448af990e60dcce462/lxml-6.0.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:2ed6c667fcbb8c19c6791bbf40b7268ef8ddf5a96940ba9404b9f9a304832f6c", size = 5275008, upload-time = "2025-09-22T04:01:07.327Z" }, + { url = "https://files.pythonhosted.org/packages/6c/8c/478a0dc6b6ed661451379447cdbec77c05741a75736d97e5b2b729687828/lxml-6.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b8f18914faec94132e5b91e69d76a5c1d7b0c73e2489ea8929c4aaa10b76bbf7", size = 5248906, upload-time = "2025-09-22T04:01:09.452Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/d9/5be3a6ab2784cdf9accb0703b65e1b64fcdd9311c9f007630c7db0cfcce1/lxml-6.0.2-cp311-cp311-win32.whl", hash = "sha256:6605c604e6daa9e0d7f0a2137bdc47a2e93b59c60a65466353e37f8272f47c46", size = 3610357, upload-time = "2025-09-22T04:01:11.102Z" }, + { url = "https://files.pythonhosted.org/packages/e2/7d/ca6fb13349b473d5732fb0ee3eec8f6c80fc0688e76b7d79c1008481bf1f/lxml-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e5867f2651016a3afd8dd2c8238baa66f1e2802f44bc17e236f547ace6647078", size = 4036583, upload-time = "2025-09-22T04:01:12.766Z" }, + { url = "https://files.pythonhosted.org/packages/ab/a2/51363b5ecd3eab46563645f3a2c3836a2fc67d01a1b87c5017040f39f567/lxml-6.0.2-cp311-cp311-win_arm64.whl", hash = "sha256:4197fb2534ee05fd3e7afaab5d8bfd6c2e186f65ea7f9cd6a82809c887bd1285", size = 3680591, upload-time = "2025-09-22T04:01:14.874Z" }, + { url = "https://files.pythonhosted.org/packages/f3/c8/8ff2bc6b920c84355146cd1ab7d181bc543b89241cfb1ebee824a7c81457/lxml-6.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a59f5448ba2ceccd06995c95ea59a7674a10de0810f2ce90c9006f3cbc044456", size = 8661887, upload-time = "2025-09-22T04:01:17.265Z" }, + { url = "https://files.pythonhosted.org/packages/37/6f/9aae1008083bb501ef63284220ce81638332f9ccbfa53765b2b7502203cf/lxml-6.0.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e8113639f3296706fbac34a30813929e29247718e88173ad849f57ca59754924", size = 4667818, upload-time = "2025-09-22T04:01:19.688Z" }, + { url = "https://files.pythonhosted.org/packages/f1/ca/31fb37f99f37f1536c133476674c10b577e409c0a624384147653e38baf2/lxml-6.0.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a8bef9b9825fa8bc816a6e641bb67219489229ebc648be422af695f6e7a4fa7f", size = 4950807, upload-time = "2025-09-22T04:01:21.487Z" }, + { url = "https://files.pythonhosted.org/packages/da/87/f6cb9442e4bada8aab5ae7e1046264f62fdbeaa6e3f6211b93f4c0dd97f1/lxml-6.0.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:65ea18d710fd14e0186c2f973dc60bb52039a275f82d3c44a0e42b43440ea534", size = 5109179, upload-time = "2025-09-22T04:01:23.32Z" }, + { url = "https://files.pythonhosted.org/packages/c8/20/a7760713e65888db79bbae4f6146a6ae5c04e4a204a3c48896c408cd6ed2/lxml-6.0.2-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c371aa98126a0d4c739ca93ceffa0fd7a5d732e3ac66a46e74339acd4d334564", size = 5023044, upload-time = "2025-09-22T04:01:25.118Z" }, + { url = "https://files.pythonhosted.org/packages/a2/b0/7e64e0460fcb36471899f75831509098f3fd7cd02a3833ac517433cb4f8f/lxml-6.0.2-cp312-cp312-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:700efd30c0fa1a3581d80a748157397559396090a51d306ea59a70020223d16f", size = 5359685, upload-time = "2025-09-22T04:01:27.398Z" }, + { url = "https://files.pythonhosted.org/packages/b9/e1/e5df362e9ca4e2f48ed6411bd4b3a0ae737cc842e96877f5bf9428055ab4/lxml-6.0.2-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c33e66d44fe60e72397b487ee92e01da0d09ba2d66df8eae42d77b6d06e5eba0", size = 5654127, upload-time = "2025-09-22T04:01:29.629Z" }, + { url = "https://files.pythonhosted.org/packages/c6/d1/232b3309a02d60f11e71857778bfcd4acbdb86c07db8260caf7d008b08f8/lxml-6.0.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:90a345bbeaf9d0587a3aaffb7006aa39ccb6ff0e96a57286c0cb2fd1520ea192", size = 5253958, upload-time = "2025-09-22T04:01:31.535Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/35/d955a070994725c4f7d80583a96cab9c107c57a125b20bb5f708fe941011/lxml-6.0.2-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:064fdadaf7a21af3ed1dcaa106b854077fbeada827c18f72aec9346847cd65d0", size = 4711541, upload-time = "2025-09-22T04:01:33.801Z" }, + { url = "https://files.pythonhosted.org/packages/1e/be/667d17363b38a78c4bd63cfd4b4632029fd68d2c2dc81f25ce9eb5224dd5/lxml-6.0.2-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fbc74f42c3525ac4ffa4b89cbdd00057b6196bcefe8bce794abd42d33a018092", size = 5267426, upload-time = "2025-09-22T04:01:35.639Z" }, + { url = "https://files.pythonhosted.org/packages/ea/47/62c70aa4a1c26569bc958c9ca86af2bb4e1f614e8c04fb2989833874f7ae/lxml-6.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6ddff43f702905a4e32bc24f3f2e2edfe0f8fde3277d481bffb709a4cced7a1f", size = 5064917, upload-time = "2025-09-22T04:01:37.448Z" }, + { url = "https://files.pythonhosted.org/packages/bd/55/6ceddaca353ebd0f1908ef712c597f8570cc9c58130dbb89903198e441fd/lxml-6.0.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:6da5185951d72e6f5352166e3da7b0dc27aa70bd1090b0eb3f7f7212b53f1bb8", size = 4788795, upload-time = "2025-09-22T04:01:39.165Z" }, + { url = "https://files.pythonhosted.org/packages/cf/e8/fd63e15da5e3fd4c2146f8bbb3c14e94ab850589beab88e547b2dbce22e1/lxml-6.0.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:57a86e1ebb4020a38d295c04fc79603c7899e0df71588043eb218722dabc087f", size = 5676759, upload-time = "2025-09-22T04:01:41.506Z" }, + { url = "https://files.pythonhosted.org/packages/76/47/b3ec58dc5c374697f5ba37412cd2728f427d056315d124dd4b61da381877/lxml-6.0.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:2047d8234fe735ab77802ce5f2297e410ff40f5238aec569ad7c8e163d7b19a6", size = 5255666, upload-time = "2025-09-22T04:01:43.363Z" }, + { url = "https://files.pythonhosted.org/packages/19/93/03ba725df4c3d72afd9596eef4a37a837ce8e4806010569bedfcd2cb68fd/lxml-6.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6f91fd2b2ea15a6800c8e24418c0775a1694eefc011392da73bc6cef2623b322", size = 5277989, upload-time = "2025-09-22T04:01:45.215Z" }, + { url = "https://files.pythonhosted.org/packages/c6/80/c06de80bfce881d0ad738576f243911fccf992687ae09fd80b734712b39c/lxml-6.0.2-cp312-cp312-win32.whl", hash = "sha256:3ae2ce7d6fedfb3414a2b6c5e20b249c4c607f72cb8d2bb7cc9c6ec7c6f4e849", size = 3611456, upload-time = "2025-09-22T04:01:48.243Z" }, + { url = "https://files.pythonhosted.org/packages/f7/d7/0cdfb6c3e30893463fb3d1e52bc5f5f99684a03c29a0b6b605cfae879cd5/lxml-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:72c87e5ee4e58a8354fb9c7c84cbf95a1c8236c127a5d1b7683f04bed8361e1f", size = 4011793, upload-time = "2025-09-22T04:01:50.042Z" }, + { url = "https://files.pythonhosted.org/packages/ea/7b/93c73c67db235931527301ed3785f849c78991e2e34f3fd9a6663ffda4c5/lxml-6.0.2-cp312-cp312-win_arm64.whl", hash = "sha256:61cb10eeb95570153e0c0e554f58df92ecf5109f75eacad4a95baa709e26c3d6", size = 3672836, upload-time = "2025-09-22T04:01:52.145Z" }, + { url = "https://files.pythonhosted.org/packages/53/fd/4e8f0540608977aea078bf6d79f128e0e2c2bba8af1acf775c30baa70460/lxml-6.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9b33d21594afab46f37ae58dfadd06636f154923c4e8a4d754b0127554eb2e77", size = 8648494, upload-time = "2025-09-22T04:01:54.242Z" }, + { url = 
"https://files.pythonhosted.org/packages/5d/f4/2a94a3d3dfd6c6b433501b8d470a1960a20ecce93245cf2db1706adf6c19/lxml-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6c8963287d7a4c5c9a432ff487c52e9c5618667179c18a204bdedb27310f022f", size = 4661146, upload-time = "2025-09-22T04:01:56.282Z" }, + { url = "https://files.pythonhosted.org/packages/25/2e/4efa677fa6b322013035d38016f6ae859d06cac67437ca7dc708a6af7028/lxml-6.0.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1941354d92699fb5ffe6ed7b32f9649e43c2feb4b97205f75866f7d21aa91452", size = 4946932, upload-time = "2025-09-22T04:01:58.989Z" }, + { url = "https://files.pythonhosted.org/packages/ce/0f/526e78a6d38d109fdbaa5049c62e1d32fdd70c75fb61c4eadf3045d3d124/lxml-6.0.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bb2f6ca0ae2d983ded09357b84af659c954722bbf04dea98030064996d156048", size = 5100060, upload-time = "2025-09-22T04:02:00.812Z" }, + { url = "https://files.pythonhosted.org/packages/81/76/99de58d81fa702cc0ea7edae4f4640416c2062813a00ff24bd70ac1d9c9b/lxml-6.0.2-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb2a12d704f180a902d7fa778c6d71f36ceb7b0d317f34cdc76a5d05aa1dd1df", size = 5019000, upload-time = "2025-09-22T04:02:02.671Z" }, + { url = "https://files.pythonhosted.org/packages/b5/35/9e57d25482bc9a9882cb0037fdb9cc18f4b79d85df94fa9d2a89562f1d25/lxml-6.0.2-cp313-cp313-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:6ec0e3f745021bfed19c456647f0298d60a24c9ff86d9d051f52b509663feeb1", size = 5348496, upload-time = "2025-09-22T04:02:04.904Z" }, + { url = "https://files.pythonhosted.org/packages/a6/8e/cb99bd0b83ccc3e8f0f528e9aa1f7a9965dfec08c617070c5db8d63a87ce/lxml-6.0.2-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:846ae9a12d54e368933b9759052d6206a9e8b250291109c48e350c1f1f49d916", size = 5643779, upload-time = "2025-09-22T04:02:06.689Z" }, + { url = "https://files.pythonhosted.org/packages/d0/34/9e591954939276bb679b73773836c6684c22e56d05980e31d52a9a8deb18/lxml-6.0.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef9266d2aa545d7374938fb5c484531ef5a2ec7f2d573e62f8ce722c735685fd", size = 5244072, upload-time = "2025-09-22T04:02:08.587Z" }, + { url = "https://files.pythonhosted.org/packages/8d/27/b29ff065f9aaca443ee377aff699714fcbffb371b4fce5ac4ca759e436d5/lxml-6.0.2-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:4077b7c79f31755df33b795dc12119cb557a0106bfdab0d2c2d97bd3cf3dffa6", size = 4718675, upload-time = "2025-09-22T04:02:10.783Z" }, + { url = "https://files.pythonhosted.org/packages/2b/9f/f756f9c2cd27caa1a6ef8c32ae47aadea697f5c2c6d07b0dae133c244fbe/lxml-6.0.2-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a7c5d5e5f1081955358533be077166ee97ed2571d6a66bdba6ec2f609a715d1a", size = 5255171, upload-time = "2025-09-22T04:02:12.631Z" }, + { url = "https://files.pythonhosted.org/packages/61/46/bb85ea42d2cb1bd8395484fd72f38e3389611aa496ac7772da9205bbda0e/lxml-6.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8f8d0cbd0674ee89863a523e6994ac25fd5be9c8486acfc3e5ccea679bad2679", size = 5057175, upload-time = "2025-09-22T04:02:14.718Z" }, + { url = "https://files.pythonhosted.org/packages/95/0c/443fc476dcc8e41577f0af70458c50fe299a97bb6b7505bb1ae09aa7f9ac/lxml-6.0.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:2cbcbf6d6e924c28f04a43f3b6f6e272312a090f269eff68a2982e13e5d57659", size = 4785688, upload-time = 
"2025-09-22T04:02:16.957Z" }, + { url = "https://files.pythonhosted.org/packages/48/78/6ef0b359d45bb9697bc5a626e1992fa5d27aa3f8004b137b2314793b50a0/lxml-6.0.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dfb874cfa53340009af6bdd7e54ebc0d21012a60a4e65d927c2e477112e63484", size = 5660655, upload-time = "2025-09-22T04:02:18.815Z" }, + { url = "https://files.pythonhosted.org/packages/ff/ea/e1d33808f386bc1339d08c0dcada6e4712d4ed8e93fcad5f057070b7988a/lxml-6.0.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:fb8dae0b6b8b7f9e96c26fdd8121522ce5de9bb5538010870bd538683d30e9a2", size = 5247695, upload-time = "2025-09-22T04:02:20.593Z" }, + { url = "https://files.pythonhosted.org/packages/4f/47/eba75dfd8183673725255247a603b4ad606f4ae657b60c6c145b381697da/lxml-6.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:358d9adae670b63e95bc59747c72f4dc97c9ec58881d4627fe0120da0f90d314", size = 5269841, upload-time = "2025-09-22T04:02:22.489Z" }, + { url = "https://files.pythonhosted.org/packages/76/04/5c5e2b8577bc936e219becb2e98cdb1aca14a4921a12995b9d0c523502ae/lxml-6.0.2-cp313-cp313-win32.whl", hash = "sha256:e8cd2415f372e7e5a789d743d133ae474290a90b9023197fd78f32e2dc6873e2", size = 3610700, upload-time = "2025-09-22T04:02:24.465Z" }, + { url = "https://files.pythonhosted.org/packages/fe/0a/4643ccc6bb8b143e9f9640aa54e38255f9d3b45feb2cbe7ae2ca47e8782e/lxml-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:b30d46379644fbfc3ab81f8f82ae4de55179414651f110a1514f0b1f8f6cb2d7", size = 4010347, upload-time = "2025-09-22T04:02:26.286Z" }, + { url = "https://files.pythonhosted.org/packages/31/ef/dcf1d29c3f530577f61e5fe2f1bd72929acf779953668a8a47a479ae6f26/lxml-6.0.2-cp313-cp313-win_arm64.whl", hash = "sha256:13dcecc9946dca97b11b7c40d29fba63b55ab4170d3c0cf8c0c164343b9bfdcf", size = 3671248, upload-time = "2025-09-22T04:02:27.918Z" }, + { url = "https://files.pythonhosted.org/packages/03/15/d4a377b385ab693ce97b472fe0c77c2b16ec79590e688b3ccc71fba19884/lxml-6.0.2-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:b0c732aa23de8f8aec23f4b580d1e52905ef468afb4abeafd3fec77042abb6fe", size = 8659801, upload-time = "2025-09-22T04:02:30.113Z" }, + { url = "https://files.pythonhosted.org/packages/c8/e8/c128e37589463668794d503afaeb003987373c5f94d667124ffd8078bbd9/lxml-6.0.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:4468e3b83e10e0317a89a33d28f7aeba1caa4d1a6fd457d115dd4ffe90c5931d", size = 4659403, upload-time = "2025-09-22T04:02:32.119Z" }, + { url = "https://files.pythonhosted.org/packages/00/ce/74903904339decdf7da7847bb5741fc98a5451b42fc419a86c0c13d26fe2/lxml-6.0.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:abd44571493973bad4598a3be7e1d807ed45aa2adaf7ab92ab7c62609569b17d", size = 4966974, upload-time = "2025-09-22T04:02:34.155Z" }, + { url = "https://files.pythonhosted.org/packages/1f/d3/131dec79ce61c5567fecf82515bd9bc36395df42501b50f7f7f3bd065df0/lxml-6.0.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:370cd78d5855cfbffd57c422851f7d3864e6ae72d0da615fca4dad8c45d375a5", size = 5102953, upload-time = "2025-09-22T04:02:36.054Z" }, + { url = "https://files.pythonhosted.org/packages/3a/ea/a43ba9bb750d4ffdd885f2cd333572f5bb900cd2408b67fdda07e85978a0/lxml-6.0.2-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:901e3b4219fa04ef766885fb40fa516a71662a4c61b80c94d25336b4934b71c0", size = 5055054, upload-time = "2025-09-22T04:02:38.154Z" }, + { url = 
"https://files.pythonhosted.org/packages/60/23/6885b451636ae286c34628f70a7ed1fcc759f8d9ad382d132e1c8d3d9bfd/lxml-6.0.2-cp314-cp314-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:a4bf42d2e4cf52c28cc1812d62426b9503cdb0c87a6de81442626aa7d69707ba", size = 5352421, upload-time = "2025-09-22T04:02:40.413Z" }, + { url = "https://files.pythonhosted.org/packages/48/5b/fc2ddfc94ddbe3eebb8e9af6e3fd65e2feba4967f6a4e9683875c394c2d8/lxml-6.0.2-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2c7fdaa4d7c3d886a42534adec7cfac73860b89b4e5298752f60aa5984641a0", size = 5673684, upload-time = "2025-09-22T04:02:42.288Z" }, + { url = "https://files.pythonhosted.org/packages/29/9c/47293c58cc91769130fbf85531280e8cc7868f7fbb6d92f4670071b9cb3e/lxml-6.0.2-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98a5e1660dc7de2200b00d53fa00bcd3c35a3608c305d45a7bbcaf29fa16e83d", size = 5252463, upload-time = "2025-09-22T04:02:44.165Z" }, + { url = "https://files.pythonhosted.org/packages/9b/da/ba6eceb830c762b48e711ded880d7e3e89fc6c7323e587c36540b6b23c6b/lxml-6.0.2-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:dc051506c30b609238d79eda75ee9cab3e520570ec8219844a72a46020901e37", size = 4698437, upload-time = "2025-09-22T04:02:46.524Z" }, + { url = "https://files.pythonhosted.org/packages/a5/24/7be3f82cb7990b89118d944b619e53c656c97dc89c28cfb143fdb7cd6f4d/lxml-6.0.2-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8799481bbdd212470d17513a54d568f44416db01250f49449647b5ab5b5dccb9", size = 5269890, upload-time = "2025-09-22T04:02:48.812Z" }, + { url = "https://files.pythonhosted.org/packages/1b/bd/dcfb9ea1e16c665efd7538fc5d5c34071276ce9220e234217682e7d2c4a5/lxml-6.0.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9261bb77c2dab42f3ecd9103951aeca2c40277701eb7e912c545c1b16e0e4917", size = 5097185, upload-time = "2025-09-22T04:02:50.746Z" }, + { url = "https://files.pythonhosted.org/packages/21/04/a60b0ff9314736316f28316b694bccbbabe100f8483ad83852d77fc7468e/lxml-6.0.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:65ac4a01aba353cfa6d5725b95d7aed6356ddc0a3cd734de00124d285b04b64f", size = 4745895, upload-time = "2025-09-22T04:02:52.968Z" }, + { url = "https://files.pythonhosted.org/packages/d6/bd/7d54bd1846e5a310d9c715921c5faa71cf5c0853372adf78aee70c8d7aa2/lxml-6.0.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:b22a07cbb82fea98f8a2fd814f3d1811ff9ed76d0fc6abc84eb21527596e7cc8", size = 5695246, upload-time = "2025-09-22T04:02:54.798Z" }, + { url = "https://files.pythonhosted.org/packages/fd/32/5643d6ab947bc371da21323acb2a6e603cedbe71cb4c99c8254289ab6f4e/lxml-6.0.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:d759cdd7f3e055d6bc8d9bec3ad905227b2e4c785dc16c372eb5b5e83123f48a", size = 5260797, upload-time = "2025-09-22T04:02:57.058Z" }, + { url = "https://files.pythonhosted.org/packages/33/da/34c1ec4cff1eea7d0b4cd44af8411806ed943141804ac9c5d565302afb78/lxml-6.0.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:945da35a48d193d27c188037a05fec5492937f66fb1958c24fc761fb9d40d43c", size = 5277404, upload-time = "2025-09-22T04:02:58.966Z" }, + { url = "https://files.pythonhosted.org/packages/82/57/4eca3e31e54dc89e2c3507e1cd411074a17565fa5ffc437c4ae0a00d439e/lxml-6.0.2-cp314-cp314-win32.whl", hash = "sha256:be3aaa60da67e6153eb15715cc2e19091af5dc75faef8b8a585aea372507384b", size = 3670072, upload-time = "2025-09-22T04:03:38.05Z" }, + { url = 
"https://files.pythonhosted.org/packages/e3/e0/c96cf13eccd20c9421ba910304dae0f619724dcf1702864fd59dd386404d/lxml-6.0.2-cp314-cp314-win_amd64.whl", hash = "sha256:fa25afbadead523f7001caf0c2382afd272c315a033a7b06336da2637d92d6ed", size = 4080617, upload-time = "2025-09-22T04:03:39.835Z" }, + { url = "https://files.pythonhosted.org/packages/d5/5d/b3f03e22b3d38d6f188ef044900a9b29b2fe0aebb94625ce9fe244011d34/lxml-6.0.2-cp314-cp314-win_arm64.whl", hash = "sha256:063eccf89df5b24e361b123e257e437f9e9878f425ee9aae3144c77faf6da6d8", size = 3754930, upload-time = "2025-09-22T04:03:41.565Z" }, + { url = "https://files.pythonhosted.org/packages/5e/5c/42c2c4c03554580708fc738d13414801f340c04c3eff90d8d2d227145275/lxml-6.0.2-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:6162a86d86893d63084faaf4ff937b3daea233e3682fb4474db07395794fa80d", size = 8910380, upload-time = "2025-09-22T04:03:01.645Z" }, + { url = "https://files.pythonhosted.org/packages/bf/4f/12df843e3e10d18d468a7557058f8d3733e8b6e12401f30b1ef29360740f/lxml-6.0.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:414aaa94e974e23a3e92e7ca5b97d10c0cf37b6481f50911032c69eeb3991bba", size = 4775632, upload-time = "2025-09-22T04:03:03.814Z" }, + { url = "https://files.pythonhosted.org/packages/e4/0c/9dc31e6c2d0d418483cbcb469d1f5a582a1cd00a1f4081953d44051f3c50/lxml-6.0.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:48461bd21625458dd01e14e2c38dd0aea69addc3c4f960c30d9f59d7f93be601", size = 4975171, upload-time = "2025-09-22T04:03:05.651Z" }, + { url = "https://files.pythonhosted.org/packages/e7/2b/9b870c6ca24c841bdd887504808f0417aa9d8d564114689266f19ddf29c8/lxml-6.0.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:25fcc59afc57d527cfc78a58f40ab4c9b8fd096a9a3f964d2781ffb6eb33f4ed", size = 5110109, upload-time = "2025-09-22T04:03:07.452Z" }, + { url = "https://files.pythonhosted.org/packages/bf/0c/4f5f2a4dd319a178912751564471355d9019e220c20d7db3fb8307ed8582/lxml-6.0.2-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5179c60288204e6ddde3f774a93350177e08876eaf3ab78aa3a3649d43eb7d37", size = 5041061, upload-time = "2025-09-22T04:03:09.297Z" }, + { url = "https://files.pythonhosted.org/packages/12/64/554eed290365267671fe001a20d72d14f468ae4e6acef1e179b039436967/lxml-6.0.2-cp314-cp314t-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:967aab75434de148ec80597b75062d8123cadf2943fb4281f385141e18b21338", size = 5306233, upload-time = "2025-09-22T04:03:11.651Z" }, + { url = "https://files.pythonhosted.org/packages/7a/31/1d748aa275e71802ad9722df32a7a35034246b42c0ecdd8235412c3396ef/lxml-6.0.2-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d100fcc8930d697c6561156c6810ab4a508fb264c8b6779e6e61e2ed5e7558f9", size = 5604739, upload-time = "2025-09-22T04:03:13.592Z" }, + { url = "https://files.pythonhosted.org/packages/8f/41/2c11916bcac09ed561adccacceaedd2bf0e0b25b297ea92aab99fd03d0fa/lxml-6.0.2-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ca59e7e13e5981175b8b3e4ab84d7da57993eeff53c07764dcebda0d0e64ecd", size = 5225119, upload-time = "2025-09-22T04:03:15.408Z" }, + { url = "https://files.pythonhosted.org/packages/99/05/4e5c2873d8f17aa018e6afde417c80cc5d0c33be4854cce3ef5670c49367/lxml-6.0.2-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:957448ac63a42e2e49531b9d6c0fa449a1970dbc32467aaad46f11545be9af1d", size = 4633665, upload-time = "2025-09-22T04:03:17.262Z" }, + { url = 
"https://files.pythonhosted.org/packages/0f/c9/dcc2da1bebd6275cdc723b515f93edf548b82f36a5458cca3578bc899332/lxml-6.0.2-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b7fc49c37f1786284b12af63152fe1d0990722497e2d5817acfe7a877522f9a9", size = 5234997, upload-time = "2025-09-22T04:03:19.14Z" }, + { url = "https://files.pythonhosted.org/packages/9c/e2/5172e4e7468afca64a37b81dba152fc5d90e30f9c83c7c3213d6a02a5ce4/lxml-6.0.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e19e0643cc936a22e837f79d01a550678da8377d7d801a14487c10c34ee49c7e", size = 5090957, upload-time = "2025-09-22T04:03:21.436Z" }, + { url = "https://files.pythonhosted.org/packages/a5/b3/15461fd3e5cd4ddcb7938b87fc20b14ab113b92312fc97afe65cd7c85de1/lxml-6.0.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:1db01e5cf14345628e0cbe71067204db658e2fb8e51e7f33631f5f4735fefd8d", size = 4764372, upload-time = "2025-09-22T04:03:23.27Z" }, + { url = "https://files.pythonhosted.org/packages/05/33/f310b987c8bf9e61c4dd8e8035c416bd3230098f5e3cfa69fc4232de7059/lxml-6.0.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:875c6b5ab39ad5291588aed6925fac99d0097af0dd62f33c7b43736043d4a2ec", size = 5634653, upload-time = "2025-09-22T04:03:25.767Z" }, + { url = "https://files.pythonhosted.org/packages/70/ff/51c80e75e0bc9382158133bdcf4e339b5886c6ee2418b5199b3f1a61ed6d/lxml-6.0.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:cdcbed9ad19da81c480dfd6dd161886db6096083c9938ead313d94b30aadf272", size = 5233795, upload-time = "2025-09-22T04:03:27.62Z" }, + { url = "https://files.pythonhosted.org/packages/56/4d/4856e897df0d588789dd844dbed9d91782c4ef0b327f96ce53c807e13128/lxml-6.0.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:80dadc234ebc532e09be1975ff538d154a7fa61ea5031c03d25178855544728f", size = 5257023, upload-time = "2025-09-22T04:03:30.056Z" }, + { url = "https://files.pythonhosted.org/packages/0f/85/86766dfebfa87bea0ab78e9ff7a4b4b45225df4b4d3b8cc3c03c5cd68464/lxml-6.0.2-cp314-cp314t-win32.whl", hash = "sha256:da08e7bb297b04e893d91087df19638dc7a6bb858a954b0cc2b9f5053c922312", size = 3911420, upload-time = "2025-09-22T04:03:32.198Z" }, + { url = "https://files.pythonhosted.org/packages/fe/1a/b248b355834c8e32614650b8008c69ffeb0ceb149c793961dd8c0b991bb3/lxml-6.0.2-cp314-cp314t-win_amd64.whl", hash = "sha256:252a22982dca42f6155125ac76d3432e548a7625d56f5a273ee78a5057216eca", size = 4406837, upload-time = "2025-09-22T04:03:34.027Z" }, + { url = "https://files.pythonhosted.org/packages/92/aa/df863bcc39c5e0946263454aba394de8a9084dbaff8ad143846b0d844739/lxml-6.0.2-cp314-cp314t-win_arm64.whl", hash = "sha256:bb4c1847b303835d89d785a18801a883436cdfd5dc3d62947f9c49e24f0f5a2c", size = 3822205, upload-time = "2025-09-22T04:03:36.249Z" }, + { url = "https://files.pythonhosted.org/packages/e7/9c/780c9a8fce3f04690b374f72f41306866b0400b9d0fdf3e17aaa37887eed/lxml-6.0.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e748d4cf8fef2526bb2a589a417eba0c8674e29ffcb570ce2ceca44f1e567bf6", size = 3939264, upload-time = "2025-09-22T04:04:32.892Z" }, + { url = "https://files.pythonhosted.org/packages/f5/5a/1ab260c00adf645d8bf7dec7f920f744b032f69130c681302821d5debea6/lxml-6.0.2-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4ddb1049fa0579d0cbd00503ad8c58b9ab34d1254c77bc6a5576d96ec7853dba", size = 4216435, upload-time = "2025-09-22T04:04:34.907Z" }, + { url = 
"https://files.pythonhosted.org/packages/f2/37/565f3b3d7ffede22874b6d86be1a1763d00f4ea9fc5b9b6ccb11e4ec8612/lxml-6.0.2-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cb233f9c95f83707dae461b12b720c1af9c28c2d19208e1be03387222151daf5", size = 4325913, upload-time = "2025-09-22T04:04:37.205Z" }, + { url = "https://files.pythonhosted.org/packages/22/ec/f3a1b169b2fb9d03467e2e3c0c752ea30e993be440a068b125fc7dd248b0/lxml-6.0.2-pp310-pypy310_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bc456d04db0515ce3320d714a1eac7a97774ff0849e7718b492d957da4631dd4", size = 4269357, upload-time = "2025-09-22T04:04:39.322Z" }, + { url = "https://files.pythonhosted.org/packages/77/a2/585a28fe3e67daa1cf2f06f34490d556d121c25d500b10082a7db96e3bcd/lxml-6.0.2-pp310-pypy310_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2613e67de13d619fd283d58bda40bff0ee07739f624ffee8b13b631abf33083d", size = 4412295, upload-time = "2025-09-22T04:04:41.647Z" }, + { url = "https://files.pythonhosted.org/packages/7b/d9/a57dd8bcebd7c69386c20263830d4fa72d27e6b72a229ef7a48e88952d9a/lxml-6.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:24a8e756c982c001ca8d59e87c80c4d9dcd4d9b44a4cbeb8d9be4482c514d41d", size = 3516913, upload-time = "2025-09-22T04:04:43.602Z" }, + { url = "https://files.pythonhosted.org/packages/0b/11/29d08bc103a62c0eba8016e7ed5aeebbf1e4312e83b0b1648dd203b0e87d/lxml-6.0.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1c06035eafa8404b5cf475bb37a9f6088b0aca288d4ccc9d69389750d5543700", size = 3949829, upload-time = "2025-09-22T04:04:45.608Z" }, + { url = "https://files.pythonhosted.org/packages/12/b3/52ab9a3b31e5ab8238da241baa19eec44d2ab426532441ee607165aebb52/lxml-6.0.2-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c7d13103045de1bdd6fe5d61802565f1a3537d70cd3abf596aa0af62761921ee", size = 4226277, upload-time = "2025-09-22T04:04:47.754Z" }, + { url = "https://files.pythonhosted.org/packages/a0/33/1eaf780c1baad88224611df13b1c2a9dfa460b526cacfe769103ff50d845/lxml-6.0.2-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a3c150a95fbe5ac91de323aa756219ef9cf7fde5a3f00e2281e30f33fa5fa4f", size = 4330433, upload-time = "2025-09-22T04:04:49.907Z" }, + { url = "https://files.pythonhosted.org/packages/7a/c1/27428a2ff348e994ab4f8777d3a0ad510b6b92d37718e5887d2da99952a2/lxml-6.0.2-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:60fa43be34f78bebb27812ed90f1925ec99560b0fa1decdb7d12b84d857d31e9", size = 4272119, upload-time = "2025-09-22T04:04:51.801Z" }, + { url = "https://files.pythonhosted.org/packages/f0/d0/3020fa12bcec4ab62f97aab026d57c2f0cfd480a558758d9ca233bb6a79d/lxml-6.0.2-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21c73b476d3cfe836be731225ec3421fa2f048d84f6df6a8e70433dff1376d5a", size = 4417314, upload-time = "2025-09-22T04:04:55.024Z" }, + { url = "https://files.pythonhosted.org/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" }, +] + +[[package]] +name = "mammoth" +version = "1.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cobble" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/ed/3c/a58418d2af00f2da60d4a51e18cd0311307b72d48d2fffec36a97b4a5e44/mammoth-1.11.0.tar.gz", hash = "sha256:a0f59e442f34d5b6447f4b0999306cbf3e67aaabfa8cb516f878fb1456744637", size = 53142, upload-time = "2025-09-19T10:35:20.373Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/54/2e39566a131b13f6d8d193f974cb6a34e81bb7cc2fa6f7e03de067b36588/mammoth-1.11.0-py2.py3-none-any.whl", hash = "sha256:c077ab0d450bd7c0c6ecd529a23bf7e0fa8190c929e28998308ff4eada3f063b", size = 54752, upload-time = "2025-09-19T10:35:18.699Z" }, +] + [[package]] name = "markdown-it-py" version = "4.0.0" @@ -4406,6 +4551,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, ] +[[package]] +name = "python-docx" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "lxml" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a9/f7/eddfe33871520adab45aaa1a71f0402a2252050c14c7e3009446c8f4701c/python_docx-1.2.0.tar.gz", hash = "sha256:7bc9d7b7d8a69c9c02ca09216118c86552704edc23bac179283f2e38f86220ce", size = 5723256, upload-time = "2025-06-16T20:46:27.921Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" }, +] + [[package]] name = "python-dotenv" version = "1.2.1" @@ -5204,7 +5362,7 @@ wheels = [ [[package]] name = "skill-seekers" -version = "3.1.2" +version = "3.1.3" source = { editable = "." 
} dependencies = [ { name = "anthropic" }, @@ -5242,10 +5400,12 @@ all = [ { name = "google-generativeai" }, { name = "httpx" }, { name = "httpx-sse" }, + { name = "mammoth" }, { name = "mcp" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "openai" }, + { name = "python-docx" }, { name = "sentence-transformers" }, { name = "sse-starlette" }, { name = "starlette" }, @@ -5268,6 +5428,10 @@ azure = [ chroma = [ { name = "chromadb" }, ] +docx = [ + { name = "mammoth" }, + { name = "python-docx" }, +] embedding = [ { name = "fastapi" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, @@ -5357,6 +5521,8 @@ requires-dist = [ { name = "jsonschema", specifier = ">=4.25.1" }, { name = "langchain", specifier = ">=1.2.10" }, { name = "llama-index", specifier = ">=0.14.15" }, + { name = "mammoth", marker = "extra == 'all'", specifier = ">=1.6.0" }, + { name = "mammoth", marker = "extra == 'docx'", specifier = ">=1.6.0" }, { name = "mcp", marker = "extra == 'all'", specifier = ">=1.25,<2" }, { name = "mcp", marker = "extra == 'mcp'", specifier = ">=1.25,<2" }, { name = "networkx", specifier = ">=3.0" }, @@ -5373,6 +5539,8 @@ requires-dist = [ { name = "pygments", specifier = ">=2.19.2" }, { name = "pymupdf", specifier = ">=1.24.14" }, { name = "pytesseract", specifier = ">=0.3.13" }, + { name = "python-docx", marker = "extra == 'all'", specifier = ">=1.1.0" }, + { name = "python-docx", marker = "extra == 'docx'", specifier = ">=1.1.0" }, { name = "python-dotenv", specifier = ">=1.1.1" }, { name = "pyyaml", specifier = ">=6.0" }, { name = "requests", specifier = ">=2.32.5" }, @@ -5395,7 +5563,7 @@ requires-dist = [ { name = "weaviate-client", marker = "extra == 'rag-upload'", specifier = ">=3.25.0" }, { name = "weaviate-client", marker = "extra == 'weaviate'", specifier = ">=3.25.0" }, ] -provides-extras = ["mcp", "gemini", "openai", "all-llms", "s3", "gcs", "azure", "chroma", "weaviate", "sentence-transformers", "rag-upload", "all-cloud", "embedding", "all"] +provides-extras = ["mcp", "gemini", "openai", "all-llms", "s3", "gcs", "azure", "docx", "chroma", "weaviate", "sentence-transformers", "rag-upload", "all-cloud", "embedding", "all"] [package.metadata.requires-dev] dev = [