#!/usr/bin/env python3
"""
Tests for chunking integration in package command and RAG adaptors.

Verifies that RAGChunker is properly wired into:
- the package_skill.py command
- base_adaptor._maybe_chunk_content()
- all 7 RAG adaptors (langchain, llama-index, haystack, weaviate,
  chroma, faiss, qdrant)
"""

import json
from pathlib import Path

import pytest

from skill_seekers.cli.adaptors import get_adaptor


def create_test_skill(tmp_path: Path, large_doc: bool = False) -> Path:
    """
    Build a throwaway skill directory for chunking tests.

    Args:
        tmp_path: Temporary directory to create the skill under.
        large_doc: When True, write documents big enough (>512 tokens)
            to trigger chunking.

    Returns:
        Path to the created skill directory.
    """
    skill_dir = tmp_path / "test_skill"
    skill_dir.mkdir()

    # SKILL.md: either ~10KB (roughly 2500 tokens) or well under 512 tokens.
    skill_md = (
        "# Test Skill\n\n" + ("Lorem ipsum dolor sit amet. " * 2000)
        if large_doc
        else "# Test Skill\n\nThis is a small test document."
    )
    (skill_dir / "SKILL.md").write_text(skill_md)

    # A single reference document under references/.
    refs_dir = skill_dir / "references"
    refs_dir.mkdir()
    reference_md = (
        "# API Reference\n\n" + ("Function details here. " * 1000)
        if large_doc
        else "# API Reference\n\nSome API documentation."
    )
    (refs_dir / "api_reference.md").write_text(reference_md)

    return skill_dir


class TestChunkingDisabledByDefault:
    """Chunking must stay off unless explicitly requested."""

    def test_langchain_no_chunking_default(self, tmp_path):
        """LangChain packaging leaves even large docs unchunked by default."""
        skill_dir = create_test_skill(tmp_path, large_doc=True)

        adaptor = get_adaptor("langchain")
        package_path = adaptor.package(skill_dir, tmp_path)

        with open(package_path) as fh:
            data = json.load(fh)

        # Exactly two documents: SKILL.md plus the single reference file.
        assert len(data) == 2, f"Expected 2 docs, got {len(data)}"

        # And no chunking metadata anywhere.
        for doc in data:
            meta = doc["metadata"]
            assert "is_chunked" not in meta
            assert "chunk_index" not in meta


class TestChunkingEnabled:
    """Chunking behavior when explicitly enabled."""

    def test_langchain_chunking_enabled(self, tmp_path):
        """Large documents are split into multiple chunks with full metadata."""
        skill_dir = create_test_skill(tmp_path, large_doc=True)

        adaptor = get_adaptor("langchain")
        package_path = adaptor.package(
            skill_dir, tmp_path, enable_chunking=True, chunk_max_tokens=512
        )

        with open(package_path) as fh:
            data = json.load(fh)

        # More entries than the 2 source documents means splitting happened.
        assert len(data) > 2, f"Large doc should be chunked, got {len(data)} docs"

        chunked_docs = [doc for doc in data if doc["metadata"].get("is_chunked")]
        assert len(chunked_docs) > 0, "Should have chunked documents"

        # Every chunked doc carries the chunk-tracking fields.
        for doc in chunked_docs:
            meta = doc["metadata"]
            assert "chunk_index" in meta
            assert "total_chunks" in meta
            assert "chunk_id" in meta

    def test_chunking_preserves_small_docs(self, tmp_path):
        """Documents under the token budget pass through unchunked."""
        skill_dir = create_test_skill(tmp_path, large_doc=False)

        adaptor = get_adaptor("langchain")
        package_path = adaptor.package(
            skill_dir, tmp_path, enable_chunking=True, chunk_max_tokens=512
        )

        with open(package_path) as fh:
            data = json.load(fh)

        assert len(data) == 2, "Small docs should not be chunked"
        for doc in data:
            assert "is_chunked" not in doc["metadata"]


class TestCodeBlockPreservation:
    """Code blocks must survive chunking intact."""

    def test_preserve_code_blocks(self, tmp_path):
        """A fenced code block is never split across chunks."""
        skill_dir = tmp_path / "test_skill"
        skill_dir.mkdir()

        # Document with a fenced python block, padded large enough to
        # force chunking.
        content = """# Test

Some intro text that needs to be here for context.

```python
def example_function():
    # This code block should not be split
    x = 1
    y = 2
    z = 3
    return x + y + z
```

More content after code block.
""" + ("Lorem ipsum dolor sit amet. " * 1000)

        (skill_dir / "SKILL.md").write_text(content)
        (skill_dir / "references").mkdir()  # required by the packager

        adaptor = get_adaptor("langchain")
        package_path = adaptor.package(
            skill_dir,
            tmp_path,
            enable_chunking=True,
            chunk_max_tokens=200,  # small chunks so splitting is forced
            preserve_code_blocks=True,
        )

        with open(package_path) as fh:
            data = json.load(fh)

        code_chunks = [doc for doc in data if "```python" in doc["page_content"]]
        assert len(code_chunks) >= 1, "Code block should be preserved"

        # Any chunk holding the opening fence must also hold the closing one.
        for chunk in code_chunks:
            content = chunk["page_content"]
            if "```python" in content:
                assert content.count("```") >= 2, "Code block should be complete"


class TestAutoChunkingForRAGPlatforms:
    """RAG targets should turn chunking on automatically."""

    @pytest.mark.parametrize(
        "platform",
        [
            "langchain",
            # Add others after they're updated:
            # 'llama-index', 'haystack', 'weaviate', 'chroma', 'faiss', 'qdrant'
        ],
    )
    def test_rag_platforms_auto_chunk(self, platform, tmp_path):
        """Packaging for a RAG platform chunks even when chunking is disabled."""
        skill_dir = create_test_skill(tmp_path, large_doc=True)

        from skill_seekers.cli.package_skill import package_skill

        # enable_chunking=False is expected to be overridden for RAG targets.
        success, package_path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target=platform,
            enable_chunking=False,
        )

        assert success, f"Packaging failed for {platform}"
        assert package_path.exists(), f"Package not created for {platform}"

        with open(package_path) as fh:
            data = json.load(fh)

        # Chunking happened if there are more entries than source docs.
        if isinstance(data, list):
            assert len(data) > 2, f"{platform}: Should auto-chunk large docs"
        elif isinstance(data, dict) and "documents" in data:
            assert len(data["documents"]) > 2, f"{platform}: Should auto-chunk large docs"


class TestBaseAdaptorChunkingHelper:
    """Unit tests for the base adaptor's _maybe_chunk_content helper."""

    def test_maybe_chunk_content_disabled(self):
        """With chunking off the helper echoes back one (content, metadata) pair."""
        from skill_seekers.cli.adaptors.langchain import LangChainAdaptor

        adaptor = LangChainAdaptor()
        content = "Test content " * 1000  # deliberately oversized
        metadata = {"source": "test"}

        chunks = adaptor._maybe_chunk_content(content, metadata, enable_chunking=False)

        assert len(chunks) == 1
        assert chunks[0][0] == content
        assert chunks[0][1] == metadata

    def test_maybe_chunk_content_small_doc(self):
        """Content below the token budget stays a single chunk even when enabled."""
        from skill_seekers.cli.adaptors.langchain import LangChainAdaptor

        adaptor = LangChainAdaptor()
        content = "Small test content"  # well under 512 tokens
        metadata = {"source": "test"}

        chunks = adaptor._maybe_chunk_content(
            content, metadata, enable_chunking=True, chunk_max_tokens=512
        )

        assert len(chunks) == 1

    def test_maybe_chunk_content_large_doc(self):
        """Oversized content is split and each chunk carries merged metadata."""
        from skill_seekers.cli.adaptors.langchain import LangChainAdaptor

        adaptor = LangChainAdaptor()
        content = "Lorem ipsum dolor sit amet. " * 2000  # >512 tokens
        metadata = {"source": "test", "file": "test.md"}

        chunks = adaptor._maybe_chunk_content(
            content,
            metadata,
            enable_chunking=True,
            chunk_max_tokens=512,
            preserve_code_blocks=True,
            source_file="test.md",
        )

        assert len(chunks) > 1, f"Large doc should be chunked, got {len(chunks)} chunks"

        for chunk_text, chunk_meta in chunks:
            assert isinstance(chunk_text, str)
            assert isinstance(chunk_meta, dict)
            assert chunk_meta["is_chunked"]
            assert "chunk_index" in chunk_meta
            assert "chunk_id" in chunk_meta
            # Caller-supplied metadata must survive the merge.
            assert chunk_meta["source"] == "test"
            assert chunk_meta["file"] == "test.md"


class TestChunkingCLIIntegration:
    """End-to-end chunking via package_skill CLI parameters."""

    def test_chunk_flag(self, tmp_path):
        """--chunk-for-rag turns chunking on."""
        from skill_seekers.cli.package_skill import package_skill

        skill_dir = create_test_skill(tmp_path, large_doc=True)

        success, package_path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,  # --chunk-for-rag flag
            chunk_max_tokens=512,
            preserve_code_blocks=True,
        )

        assert success
        assert package_path.exists()

        with open(package_path) as fh:
            data = json.load(fh)

        assert len(data) > 2

    def test_chunk_tokens_parameter(self, tmp_path):
        """--chunk-tokens controls how finely documents are split."""
        from skill_seekers.cli.package_skill import package_skill

        skill_dir = create_test_skill(tmp_path, large_doc=True)

        # First pass: small chunk budget.
        success, package_path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,
            chunk_max_tokens=256,  # Small chunks
            preserve_code_blocks=True,
        )
        assert success
        with open(package_path) as fh:
            data_small = json.load(fh)

        # Second pass: large chunk budget.
        success, package_path2 = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,
            chunk_max_tokens=1024,  # Large chunks
            preserve_code_blocks=True,
        )
        assert success
        with open(package_path2) as fh:
            data_large = json.load(fh)

        assert len(data_small) > len(data_large), (
            f"Small chunks ({len(data_small)}) should be more than large chunks ({len(data_large)})"
        )

    def test_chunk_overlap_tokens_parameter(self, tmp_path):
        """--chunk-overlap-tokens is forwarded to RAGChunker."""
        from skill_seekers.cli.package_skill import package_skill

        skill_dir = create_test_skill(tmp_path, large_doc=True)

        # Default overlap (50 tokens).
        success, package_path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,
            chunk_max_tokens=256,
            chunk_overlap_tokens=50,
        )
        assert success
        assert package_path.exists()
        with open(package_path) as fh:
            data_default = json.load(fh)

        # Enlarged overlap (128 tokens).
        success2, package_path2 = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,
            chunk_max_tokens=256,
            chunk_overlap_tokens=128,
        )
        assert success2
        assert package_path2.exists()
        with open(package_path2) as fh:
            data_large_overlap = json.load(fh)

        # More overlap means at least as many chunks.
        assert len(data_large_overlap) >= len(data_default), (
            f"Large overlap ({len(data_large_overlap)}) should produce >= chunks than default ({len(data_default)})"
        )

    def test_chunk_overlap_scales_with_chunk_size(self, tmp_path):
        """Default overlap auto-scales when chunk_tokens is raised above default."""
        from skill_seekers.cli.adaptors.base import (
            DEFAULT_CHUNK_TOKENS,
            DEFAULT_CHUNK_OVERLAP_TOKENS,
        )

        adaptor = get_adaptor("langchain")
        skill_dir = create_test_skill(tmp_path, large_doc=True)
        adaptor._build_skill_metadata(skill_dir)
        content = (skill_dir / "SKILL.md").read_text()

        # Default chunk size (512) with default overlap (50): overlap stays 50.
        chunks_default = adaptor._maybe_chunk_content(
            content,
            {"source": "test"},
            enable_chunking=True,
            chunk_max_tokens=DEFAULT_CHUNK_TOKENS,
            chunk_overlap_tokens=DEFAULT_CHUNK_OVERLAP_TOKENS,
        )

        # Chunk size 1024 with default overlap: expected to auto-scale to
        # max(50, 1024 // 10) = 102.
        chunks_large = adaptor._maybe_chunk_content(
            content,
            {"source": "test"},
            enable_chunking=True,
            chunk_max_tokens=1024,
            chunk_overlap_tokens=DEFAULT_CHUNK_OVERLAP_TOKENS,
        )

        # Both configurations must yield valid chunk lists.
        assert len(chunks_default) > 1
        assert len(chunks_large) >= 1

    def test_preserve_code_blocks_flag(self, tmp_path):
        """--no-preserve-code-blocks is accepted without error."""
        from skill_seekers.cli.package_skill import package_skill

        skill_dir = create_test_skill(tmp_path, large_doc=True)

        success, package_path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,
            chunk_max_tokens=256,
            preserve_code_blocks=False,
        )

        assert success
        assert package_path.exists()


if __name__ == "__main__":
    pytest.main([__file__, "-v"])