diff --git a/src/skill_seekers/cli/enhance_skill_local.py b/src/skill_seekers/cli/enhance_skill_local.py index 01e3822..a4d2ee2 100644 --- a/src/skill_seekers/cli/enhance_skill_local.py +++ b/src/skill_seekers/cli/enhance_skill_local.py @@ -367,7 +367,7 @@ class LocalSkillEnhancer: if line.startswith("#"): # Found heading - keep it and next 3 lines chunk = lines[i : min(i + 4, len(lines))] - chunk_chars = sum(len(l) for l in chunk) + chunk_chars = sum(len(line_text) for line_text in chunk) if current_chars + chunk_chars > max_chars: break result.extend(chunk) diff --git a/src/skill_seekers/cli/word_scraper.py b/src/skill_seekers/cli/word_scraper.py index 33d8666..1cd3375 100644 --- a/src/skill_seekers/cli/word_scraper.py +++ b/src/skill_seekers/cli/word_scraper.py @@ -79,7 +79,9 @@ class WordToSkillConverter: self.config = config self.name = config["name"] self.docx_path = config.get("docx_path", "") - self.description = config.get("description") or f"Use when referencing {self.name} documentation" + self.description = ( + config.get("description") or f"Use when referencing {self.name} documentation" + ) # Paths self.skill_dir = f"output/{self.name}" @@ -110,9 +112,7 @@ class WordToSkillConverter: raise FileNotFoundError(f"Word document not found: {self.docx_path}") if not self.docx_path.lower().endswith(".docx"): - raise ValueError( - f"Not a Word document (expected .docx): {self.docx_path}" - ) + raise ValueError(f"Not a Word document (expected .docx): {self.docx_path}") # --- Extract metadata via python-docx --- doc = python_docx.Document(self.docx_path) @@ -733,12 +733,13 @@ class WordToSkillConverter: # HTML-to-sections helper (module-level for clarity) # --------------------------------------------------------------------------- + def _build_section( section_number: int, heading: str | None, heading_level: str | None, elements: list, - doc, + doc, # noqa: ARG001 ) -> dict: """Build a section dict from a list of BeautifulSoup elements. @@ -774,10 +775,7 @@ def _build_section( # Code blocks if tag == "pre" or (tag == "code" and elem.find_parent("pre") is None): code_elem = elem.find("code") if tag == "pre" else elem - if code_elem: - code_text = code_elem.get_text() - else: - code_text = elem.get_text() + code_text = code_elem.get_text() if code_elem else elem.get_text() code_text = code_text.strip() if code_text: @@ -961,7 +959,8 @@ def main(): name = Path(args.from_json).stem.replace("_extracted", "") config = { "name": getattr(args, "name", None) or name, - "description": getattr(args, "description", None) or f"Use when referencing {name} documentation", + "description": getattr(args, "description", None) + or f"Use when referencing {name} documentation", } try: converter = WordToSkillConverter(config) @@ -1049,6 +1048,7 @@ def main(): except Exception as e: print(f"\n❌ Unexpected error during Word processing: {e}", file=sys.stderr) import traceback + traceback.print_exc() sys.exit(1) diff --git a/tests/test_chunking_integration.py b/tests/test_chunking_integration.py index e9068ba..b62eb24 100644 --- a/tests/test_chunking_integration.py +++ b/tests/test_chunking_integration.py @@ -358,7 +358,6 @@ class TestChunkingCLIIntegration: f"Small chunks ({len(data_small)}) should be more than large chunks ({len(data_large)})" ) - def test_chunk_overlap_tokens_parameter(self, tmp_path): """Test --chunk-overlap-tokens controls RAGChunker overlap.""" from skill_seekers.cli.package_skill import package_skill @@ -406,17 +405,21 @@ class TestChunkingCLIIntegration: def test_chunk_overlap_scales_with_chunk_size(self, tmp_path): """Test that overlap auto-scales when chunk_tokens is non-default but overlap is default.""" - from skill_seekers.cli.adaptors.base import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS + from skill_seekers.cli.adaptors.base import ( + DEFAULT_CHUNK_TOKENS, + DEFAULT_CHUNK_OVERLAP_TOKENS, + ) adaptor = get_adaptor("langchain") skill_dir = create_test_skill(tmp_path, large_doc=True) - metadata = adaptor._build_skill_metadata(skill_dir) + adaptor._build_skill_metadata(skill_dir) content = (skill_dir / "SKILL.md").read_text() # With default chunk size (512) and default overlap (50), overlap should be 50 chunks_default = adaptor._maybe_chunk_content( - content, {"source": "test"}, + content, + {"source": "test"}, enable_chunking=True, chunk_max_tokens=DEFAULT_CHUNK_TOKENS, chunk_overlap_tokens=DEFAULT_CHUNK_OVERLAP_TOKENS, @@ -425,7 +428,8 @@ class TestChunkingCLIIntegration: # With large chunk size (1024) and default overlap (50), # overlap should auto-scale to max(50, 1024//10) = 102 chunks_large = adaptor._maybe_chunk_content( - content, {"source": "test"}, + content, + {"source": "test"}, enable_chunking=True, chunk_max_tokens=1024, chunk_overlap_tokens=DEFAULT_CHUNK_OVERLAP_TOKENS, diff --git a/tests/test_create_arguments.py b/tests/test_create_arguments.py index 249348b..fde225f 100644 --- a/tests/test_create_arguments.py +++ b/tests/test_create_arguments.py @@ -131,7 +131,9 @@ class TestArgumentHelpers: """Should return set of universal argument names.""" names = get_universal_argument_names() assert isinstance(names, set) - assert len(names) == 19 # Phase 2: added 4 workflow arguments + local_repo_path + doc_version + assert ( + len(names) == 19 + ) # Phase 2: added 4 workflow arguments + local_repo_path + doc_version assert "name" in names assert "enhance_level" in names # Phase 1: consolidated flag assert "enhance_workflow" in names # Phase 2: workflow support diff --git a/tests/test_pinecone_adaptor.py b/tests/test_pinecone_adaptor.py index 7a81400..0453d9c 100644 --- a/tests/test_pinecone_adaptor.py +++ b/tests/test_pinecone_adaptor.py @@ -4,11 +4,10 @@ Tests for Pinecone adaptor and doc_version metadata flow. """ import json -from pathlib import Path import pytest -from skill_seekers.cli.adaptors.base import SkillAdaptor, SkillMetadata +from skill_seekers.cli.adaptors.base import SkillMetadata # --------------------------------------------------------------------------- @@ -40,9 +39,7 @@ Get started quickly. refs_dir = skill_dir / "references" refs_dir.mkdir() - (refs_dir / "api_reference.md").write_text( - "# API Reference\n\nSome API docs.\n" - ) + (refs_dir / "api_reference.md").write_text("# API Reference\n\nSome API docs.\n") (refs_dir / "getting_started.md").write_text( "# Getting Started\n\nSome getting started docs.\n" ) @@ -330,13 +327,17 @@ class TestPineconeAdaptor: if vectors is None: vectors = [{"id": "a", "metadata": {"text": "hello world"}}] pkg = tmp_path / "test-pinecone.json" - pkg.write_text(json.dumps({ - "vectors": vectors, - "index_name": "test", - "namespace": "test", - "metric": "cosine", - "dimension": 1536, - })) + pkg.write_text( + json.dumps( + { + "vectors": vectors, + "index_name": "test", + "namespace": "test", + "metric": "cosine", + "dimension": 1536, + } + ) + ) return pkg def test_upload_success_has_url_key(self, tmp_path, monkeypatch): @@ -346,7 +347,8 @@ class TestPineconeAdaptor: adaptor = PineconeAdaptor() mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch) monkeypatch.setattr( - adaptor, "_generate_openai_embeddings", + adaptor, + "_generate_openai_embeddings", lambda docs: [[0.0] * 1536] * len(docs), ) pkg = self._make_package(tmp_path) @@ -364,13 +366,16 @@ class TestPineconeAdaptor: adaptor = PineconeAdaptor() mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch) monkeypatch.setattr( - adaptor, "_generate_st_embeddings", + adaptor, + "_generate_st_embeddings", lambda docs: [[0.0] * 384] * len(docs), ) pkg = self._make_package(tmp_path) result = adaptor.upload( - pkg, api_key="fake-key", embedding_function="sentence-transformers", + pkg, + api_key="fake-key", + embedding_function="sentence-transformers", ) assert result["success"] is True # Verify create_index was called with dimension=384 @@ -385,13 +390,16 @@ class TestPineconeAdaptor: adaptor = PineconeAdaptor() mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch) monkeypatch.setattr( - adaptor, "_generate_openai_embeddings", + adaptor, + "_generate_openai_embeddings", lambda docs: [[0.0] * 1536] * len(docs), ) pkg = self._make_package(tmp_path) result = adaptor.upload( - pkg, api_key="fake-key", embedding_function="openai", + pkg, + api_key="fake-key", + embedding_function="openai", ) assert result["success"] is True mock_pc.create_index.assert_called_once() @@ -405,7 +413,7 @@ class TestPineconeAdaptor: adaptor = PineconeAdaptor() mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch) - def fail_embeddings(docs): + def fail_embeddings(_docs): raise RuntimeError("OPENAI_API_KEY not set") monkeypatch.setattr(adaptor, "_generate_openai_embeddings", fail_embeddings) @@ -423,13 +431,17 @@ class TestPineconeAdaptor: adaptor = PineconeAdaptor() mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch) monkeypatch.setattr( - adaptor, "_generate_openai_embeddings", + adaptor, + "_generate_openai_embeddings", lambda docs: [[0.0] * 768] * len(docs), ) pkg = self._make_package(tmp_path) result = adaptor.upload( - pkg, api_key="fake-key", embedding_function="openai", dimension=768, + pkg, + api_key="fake-key", + embedding_function="openai", + dimension=768, ) assert result["success"] is True mock_pc.create_index.assert_called_once() diff --git a/tests/test_upload_integration.py b/tests/test_upload_integration.py index 75aa019..fc19357 100644 --- a/tests/test_upload_integration.py +++ b/tests/test_upload_integration.py @@ -160,7 +160,10 @@ class TestEmbeddingMethodInheritance: assert hasattr(adaptor, "_generate_openai_embeddings") # Verify it's the base class method, not a local override from skill_seekers.cli.adaptors.base import SkillAdaptor - assert adaptor._generate_openai_embeddings.__func__ is SkillAdaptor._generate_openai_embeddings + + assert ( + adaptor._generate_openai_embeddings.__func__ is SkillAdaptor._generate_openai_embeddings + ) def test_weaviate_inherits_both_embedding_methods(self): """Test weaviate adaptor gets both embedding methods from base.""" @@ -168,7 +171,10 @@ class TestEmbeddingMethodInheritance: assert hasattr(adaptor, "_generate_openai_embeddings") assert hasattr(adaptor, "_generate_st_embeddings") from skill_seekers.cli.adaptors.base import SkillAdaptor - assert adaptor._generate_openai_embeddings.__func__ is SkillAdaptor._generate_openai_embeddings + + assert ( + adaptor._generate_openai_embeddings.__func__ is SkillAdaptor._generate_openai_embeddings + ) assert adaptor._generate_st_embeddings.__func__ is SkillAdaptor._generate_st_embeddings def test_pinecone_inherits_both_embedding_methods(self): @@ -177,7 +183,10 @@ class TestEmbeddingMethodInheritance: assert hasattr(adaptor, "_generate_openai_embeddings") assert hasattr(adaptor, "_generate_st_embeddings") from skill_seekers.cli.adaptors.base import SkillAdaptor - assert adaptor._generate_openai_embeddings.__func__ is SkillAdaptor._generate_openai_embeddings + + assert ( + adaptor._generate_openai_embeddings.__func__ is SkillAdaptor._generate_openai_embeddings + ) assert adaptor._generate_st_embeddings.__func__ is SkillAdaptor._generate_st_embeddings diff --git a/tests/test_word_scraper.py b/tests/test_word_scraper.py index cfc14ef..2b13f43 100644 --- a/tests/test_word_scraper.py +++ b/tests/test_word_scraper.py @@ -31,8 +31,9 @@ except ImportError: WORD_AVAILABLE = False -def _make_sample_extracted_data(num_sections=2, include_code=False, include_tables=False, - include_images=False): +def _make_sample_extracted_data( + num_sections=2, include_code=False, include_tables=False, include_images=False +): """Helper to build a minimal extracted_data dict for testing.""" mock_image_bytes = ( b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01" @@ -54,23 +55,29 @@ def _make_sample_extracted_data(num_sections=2, include_code=False, include_tabl } if include_code: section["code_samples"] = [ - {"code": f"def hello_{i}():\n return 'world'", "language": "python", - "quality_score": 7.5} + { + "code": f"def hello_{i}():\n return 'world'", + "language": "python", + "quality_score": 7.5, + } ] if include_tables: section["tables"] = [ {"headers": ["Col A", "Col B"], "rows": [["val1", "val2"], ["val3", "val4"]]} ] if include_images: - section["images"] = [ - {"index": 0, "data": mock_image_bytes, "width": 100, "height": 80} - ] + section["images"] = [{"index": 0, "data": mock_image_bytes, "width": 100, "height": 80}] pages.append(section) return { "source_file": "test.docx", - "metadata": {"title": "Test Doc", "author": "Test Author", "created": "", "modified": "", - "subject": ""}, + "metadata": { + "title": "Test Doc", + "author": "Test Author", + "created": "", + "modified": "", + "subject": "", + }, "total_sections": num_sections, "total_code_blocks": num_sections if include_code else 0, "total_images": num_sections if include_images else 0, @@ -86,6 +93,7 @@ class TestWordToSkillConverterInit(unittest.TestCase): if not WORD_AVAILABLE: self.skipTest("mammoth and python-docx not installed") from skill_seekers.cli.word_scraper import WordToSkillConverter + self.WordToSkillConverter = WordToSkillConverter self.temp_dir = tempfile.mkdtemp() @@ -131,6 +139,7 @@ class TestWordToSkillConverterInit(unittest.TestCase): def test_name_auto_detected_from_filename(self): """Test name can be extracted from filename via infer_description_from_word.""" from skill_seekers.cli.word_scraper import infer_description_from_word + desc = infer_description_from_word({}, name="my_doc") self.assertIn("my_doc", desc) @@ -142,6 +151,7 @@ class TestWordCategorization(unittest.TestCase): if not WORD_AVAILABLE: self.skipTest("mammoth and python-docx not installed") from skill_seekers.cli.word_scraper import WordToSkillConverter + self.WordToSkillConverter = WordToSkillConverter self.temp_dir = tempfile.mkdtemp() @@ -175,10 +185,22 @@ class TestWordCategorization(unittest.TestCase): converter.docx_path = "" converter.extracted_data = { "pages": [ - {"section_number": 1, "heading": "API Reference", "text": "api reference docs", - "code_samples": [], "tables": [], "images": []}, - {"section_number": 2, "heading": "Getting Started", "text": "getting started guide", - "code_samples": [], "tables": [], "images": []}, + { + "section_number": 1, + "heading": "API Reference", + "text": "api reference docs", + "code_samples": [], + "tables": [], + "images": [], + }, + { + "section_number": 2, + "heading": "Getting Started", + "text": "getting started guide", + "code_samples": [], + "tables": [], + "images": [], + }, ] } @@ -205,6 +227,7 @@ class TestWordSkillBuilding(unittest.TestCase): if not WORD_AVAILABLE: self.skipTest("mammoth and python-docx not installed") from skill_seekers.cli.word_scraper import WordToSkillConverter + self.WordToSkillConverter = WordToSkillConverter self.temp_dir = tempfile.mkdtemp() @@ -297,6 +320,7 @@ class TestWordCodeBlocks(unittest.TestCase): if not WORD_AVAILABLE: self.skipTest("mammoth and python-docx not installed") from skill_seekers.cli.word_scraper import WordToSkillConverter + self.WordToSkillConverter = WordToSkillConverter self.temp_dir = tempfile.mkdtemp() @@ -351,6 +375,7 @@ class TestWordTables(unittest.TestCase): if not WORD_AVAILABLE: self.skipTest("mammoth and python-docx not installed") from skill_seekers.cli.word_scraper import WordToSkillConverter + self.WordToSkillConverter = WordToSkillConverter self.temp_dir = tempfile.mkdtemp() @@ -393,6 +418,7 @@ class TestWordImages(unittest.TestCase): if not WORD_AVAILABLE: self.skipTest("mammoth and python-docx not installed") from skill_seekers.cli.word_scraper import WordToSkillConverter + self.WordToSkillConverter = WordToSkillConverter self.temp_dir = tempfile.mkdtemp() @@ -434,6 +460,7 @@ class TestWordErrorHandling(unittest.TestCase): if not WORD_AVAILABLE: self.skipTest("mammoth and python-docx not installed") from skill_seekers.cli.word_scraper import WordToSkillConverter + self.WordToSkillConverter = WordToSkillConverter self.temp_dir = tempfile.mkdtemp() @@ -496,6 +523,7 @@ class TestWordJSONWorkflow(unittest.TestCase): if not WORD_AVAILABLE: self.skipTest("mammoth and python-docx not installed") from skill_seekers.cli.word_scraper import WordToSkillConverter + self.WordToSkillConverter = WordToSkillConverter self.temp_dir = tempfile.mkdtemp()