Files
skill-seekers-reference/tests/test_chunking_integration.py
yusyus 51787e57bc style: Fix 411 ruff lint issues (Kimi's issue #4)
Auto-fixed lint issues with ruff --fix and --unsafe-fixes:

Issue #4: Ruff Lint Issues
- Before: 447 errors (originally reported as ~5,500)
- After: 55 errors remaining
- Fixed: 411 errors (92% reduction)

Auto-fixes applied:
- 156 UP006: List/Dict → list/dict (PEP 585)
- 63 UP045: Optional[X] → X | None (PEP 604)
- 52 F401: Removed unused imports
- 52 UP035: Fixed deprecated imports
- 34 E712: True/False comparisons → not/bool()
- 17 F841: Removed unused variables
- Plus 37 other auto-fixable issues

Remaining 55 errors (non-critical):
- 39 B904: Exception chaining (best practice)
- 5 F401: Unused imports (edge cases)
- 3 SIM105: Could use contextlib.suppress
- 8 other minor style issues

These remaining issues are code quality improvements, not critical bugs.

Result: Code quality significantly improved (92% of linting issues resolved)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-08 12:46:38 +03:00

376 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Tests for chunking integration in package command and RAG adaptors.
Tests that RAGChunker is properly integrated into:
- package_skill.py command
- base_adaptor._maybe_chunk_content()
- All 7 RAG adaptors (langchain, llama-index, haystack, weaviate, chroma, faiss, qdrant)
"""
import pytest
import json
from pathlib import Path
from skill_seekers.cli.adaptors import get_adaptor
def create_test_skill(tmp_path: Path, large_doc: bool = False) -> Path:
    """
    Build a throwaway skill directory fixture for the chunking tests.

    The layout matches what the packaging code expects: a SKILL.md at the
    root plus a references/ directory containing one markdown file.

    Args:
        tmp_path: Temporary directory to create the skill under.
        large_doc: If True, generate documents well past the 512-token
            chunking threshold (the filler repeats to roughly 56KB);
            otherwise keep both documents tiny.

    Returns:
        Path to the created skill directory.
    """
    skill_dir = tmp_path / "test_skill"
    skill_dir.mkdir()

    # Root SKILL.md: repeated filler when a large doc is requested,
    # a one-liner otherwise.
    if large_doc:
        skill_md = "# Test Skill\n\n" + ("Lorem ipsum dolor sit amet. " * 2000)
    else:
        skill_md = "# Test Skill\n\nThis is a small test document."
    (skill_dir / "SKILL.md").write_text(skill_md)

    # Single reference file under references/, sized to match the request.
    refs_dir = skill_dir / "references"
    refs_dir.mkdir()
    if large_doc:
        reference_md = "# API Reference\n\n" + ("Function details here. " * 1000)
    else:
        reference_md = "# API Reference\n\nSome API documentation."
    (refs_dir / "api_reference.md").write_text(reference_md)

    return skill_dir
class TestChunkingDisabledByDefault:
    """Chunking must be opt-in: packaging without flags never splits docs."""

    def test_langchain_no_chunking_default(self, tmp_path):
        """Test that LangChain doesn't chunk by default."""
        skill_dir = create_test_skill(tmp_path, large_doc=True)
        package_path = get_adaptor('langchain').package(skill_dir, tmp_path)

        with open(package_path) as fh:
            data = json.load(fh)

        # Exactly the two input files (SKILL.md + one reference) come back,
        # even though both are large enough to be chunked.
        assert len(data) == 2, f"Expected 2 docs, got {len(data)}"

        # And none of them carries any chunking metadata.
        for entry in data:
            meta = entry['metadata']
            assert 'is_chunked' not in meta
            assert 'chunk_index' not in meta
class TestChunkingEnabled:
    """Behaviour when chunking is explicitly switched on."""

    def test_langchain_chunking_enabled(self, tmp_path):
        """Test that LangChain chunks large documents when enabled."""
        skill_dir = create_test_skill(tmp_path, large_doc=True)
        package_path = get_adaptor('langchain').package(
            skill_dir,
            tmp_path,
            enable_chunking=True,
            chunk_max_tokens=512,
        )
        with open(package_path) as fh:
            data = json.load(fh)

        # A large source must be split into more than the two input files.
        assert len(data) > 2, f"Large doc should be chunked, got {len(data)} docs"

        # At least one document must be flagged as a chunk.
        chunked_docs = []
        for doc in data:
            if doc['metadata'].get('is_chunked'):
                chunked_docs.append(doc)
        assert len(chunked_docs) > 0, "Should have chunked documents"

        # Every chunk carries the full positional metadata set.
        for doc in chunked_docs:
            meta = doc['metadata']
            assert 'chunk_index' in meta
            assert 'total_chunks' in meta
            assert 'chunk_id' in meta

    def test_chunking_preserves_small_docs(self, tmp_path):
        """Test that small documents are not chunked."""
        skill_dir = create_test_skill(tmp_path, large_doc=False)
        package_path = get_adaptor('langchain').package(
            skill_dir,
            tmp_path,
            enable_chunking=True,
            chunk_max_tokens=512,
        )
        with open(package_path) as fh:
            data = json.load(fh)

        # Both documents fit within the token budget, so no splitting
        # happens and no chunk metadata appears.
        assert len(data) == 2, "Small docs should not be chunked"
        for doc in data:
            assert 'is_chunked' not in doc['metadata']
class TestCodeBlockPreservation:
    """Test that code blocks are preserved during chunking."""

    def test_preserve_code_blocks(self, tmp_path):
        """Test that code blocks are not split during chunking."""
        # Fixture is built by hand (not via create_test_skill) so the
        # document can contain a fenced code block.
        skill_dir = tmp_path / "test_skill"
        skill_dir.mkdir()
        # Create document with code block
        content = """# Test
Some intro text that needs to be here for context.
```python
def example_function():
# This code block should not be split
x = 1
y = 2
z = 3
return x + y + z
```
More content after code block.
""" + ("Lorem ipsum dolor sit amet. " * 1000)  # Make it large enough to force chunking
        (skill_dir / "SKILL.md").write_text(content)
        # Create references dir (required)
        (skill_dir / "references").mkdir()

        adaptor = get_adaptor('langchain')
        # chunk_max_tokens=200 is deliberately far below the document size
        # so that splitting is guaranteed to occur.
        package_path = adaptor.package(
            skill_dir,
            tmp_path,
            enable_chunking=True,
            chunk_max_tokens=200,  # Small chunks to force splitting
            preserve_code_blocks=True
        )
        with open(package_path) as f:
            data = json.load(f)
        # Find chunks with code block
        code_chunks = [
            doc for doc in data
            if '```python' in doc['page_content']
        ]
        # Code block should be in at least one chunk
        assert len(code_chunks) >= 1, "Code block should be preserved"
        # Code block should be complete (opening and closing backticks)
        for chunk in code_chunks:
            content = chunk['page_content']
            if '```python' in content:
                # Should also have closing backticks
                assert content.count('```') >= 2, "Code block should be complete"
class TestAutoChunkingForRAGPlatforms:
    """Test that chunking is auto-enabled for RAG platforms."""

    @pytest.mark.parametrize("platform", [
        'langchain',
        # Add others after they're updated:
        # 'llama-index', 'haystack', 'weaviate', 'chroma', 'faiss', 'qdrant'
    ])
    def test_rag_platforms_auto_chunk(self, platform, tmp_path):
        """Test that RAG platforms auto-enable chunking."""
        from skill_seekers.cli.package_skill import package_skill

        skill_dir = create_test_skill(tmp_path, large_doc=True)

        # enable_chunking=False is passed on purpose: packaging for a RAG
        # target is expected to override it and chunk anyway.
        success, package_path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target=platform,
            enable_chunking=False  # Explicitly disabled, but should be auto-enabled
        )
        assert success, f"Packaging failed for {platform}"
        assert package_path.exists(), f"Package not created for {platform}"

        with open(package_path) as fh:
            payload = json.load(fh)

        # Output shape varies per platform: either a flat list of docs or
        # a dict with a 'documents' key. Either way a large doc must yield
        # more than the two original files.
        if isinstance(payload, list):
            assert len(payload) > 2, f"{platform}: Should auto-chunk large docs"
        elif isinstance(payload, dict) and 'documents' in payload:
            assert len(payload['documents']) > 2, f"{platform}: Should auto-chunk large docs"
class TestBaseAdaptorChunkingHelper:
    """Unit tests for the base adaptor's _maybe_chunk_content helper."""

    def test_maybe_chunk_content_disabled(self):
        """Test that _maybe_chunk_content returns single chunk when disabled."""
        from skill_seekers.cli.adaptors.langchain import LangChainAdaptor

        adaptor = LangChainAdaptor()
        text = "Test content " * 1000  # large enough that chunking *would* trigger
        meta = {"source": "test"}
        result = adaptor._maybe_chunk_content(text, meta, enable_chunking=False)

        # Disabled: exactly one (content, metadata) pair, both untouched.
        assert len(result) == 1
        assert result[0][0] == text
        assert result[0][1] == meta

    def test_maybe_chunk_content_small_doc(self):
        """Test that small docs are not chunked even when enabled."""
        from skill_seekers.cli.adaptors.langchain import LangChainAdaptor

        adaptor = LangChainAdaptor()
        result = adaptor._maybe_chunk_content(
            "Small test content",  # well under the 512-token budget
            {"source": "test"},
            enable_chunking=True,
            chunk_max_tokens=512
        )
        # Content below the threshold passes through as a single chunk.
        assert len(result) == 1

    def test_maybe_chunk_content_large_doc(self):
        """Test that large docs are chunked when enabled."""
        from skill_seekers.cli.adaptors.langchain import LangChainAdaptor

        adaptor = LangChainAdaptor()
        chunks = adaptor._maybe_chunk_content(
            "Lorem ipsum dolor sit amet. " * 2000,  # far beyond 512 tokens
            {"source": "test", "file": "test.md"},
            enable_chunking=True,
            chunk_max_tokens=512,
            preserve_code_blocks=True,
            source_file="test.md"
        )
        assert len(chunks) > 1, f"Large doc should be chunked, got {len(chunks)} chunks"

        for chunk_text, chunk_meta in chunks:
            assert isinstance(chunk_text, str)
            assert isinstance(chunk_meta, dict)
            # Chunking metadata is attached to every piece...
            assert chunk_meta['is_chunked']
            assert 'chunk_index' in chunk_meta
            assert 'chunk_id' in chunk_meta
            # ...and the caller-supplied metadata survives unchanged.
            assert chunk_meta['source'] == 'test'
            assert chunk_meta['file'] == 'test.md'
class TestChunkingCLIIntegration:
    """Test chunking via CLI arguments."""

    def test_chunk_flag(self, tmp_path):
        """Test --chunk flag enables chunking."""
        from skill_seekers.cli.package_skill import package_skill

        skill_dir = create_test_skill(tmp_path, large_doc=True)
        success, package_path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target='langchain',
            enable_chunking=True,  # --chunk flag
            chunk_max_tokens=512,
            preserve_code_blocks=True
        )
        assert success
        assert package_path.exists()

        with open(package_path) as fh:
            docs = json.load(fh)
        # Large source + chunking on => more than the two input files.
        assert len(docs) > 2

    def test_chunk_tokens_parameter(self, tmp_path):
        """Test --chunk-tokens parameter controls chunk size."""
        from skill_seekers.cli.package_skill import package_skill

        skill_dir = create_test_skill(tmp_path, large_doc=True)

        def run(max_tokens):
            # Package the same skill with the given chunk budget and
            # return the resulting document list.
            ok, path = package_skill(
                skill_dir=skill_dir,
                open_folder_after=False,
                skip_quality_check=True,
                target='langchain',
                enable_chunking=True,
                chunk_max_tokens=max_tokens,
                preserve_code_blocks=True
            )
            assert ok
            with open(path) as fh:
                return json.load(fh)

        data_small = run(256)   # tight budget => many chunks
        data_large = run(1024)  # generous budget => fewer chunks

        # A smaller token budget must produce strictly more chunks.
        assert len(data_small) > len(data_large), \
            f"Small chunks ({len(data_small)}) should be more than large chunks ({len(data_large)})"
# Allow running this test module directly (outside a pytest invocation).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])