#!/usr/bin/env python3
"""
Tests for chunking integration in package command and RAG adaptors.

Verifies that RAGChunker is properly wired into:
- the package_skill.py command
- base_adaptor._maybe_chunk_content()
- all 7 RAG adaptors (langchain, llama-index, haystack, weaviate,
  chroma, faiss, qdrant)
"""

import json
from pathlib import Path

import pytest

from skill_seekers.cli.adaptors import get_adaptor


def create_test_skill(tmp_path: Path, large_doc: bool = False) -> Path:
    """
    Build a throwaway skill directory for chunking tests.

    Args:
        tmp_path: Temporary directory to create the skill under.
        large_doc: When True, write documents big enough (>512 tokens)
            to trigger chunking.

    Returns:
        Path to the created skill directory.
    """
    skill_dir = tmp_path / "test_skill"
    skill_dir.mkdir()

    # SKILL.md: either ~10KB (roughly 2500 tokens) or well under 512 tokens.
    skill_md = (
        "# Test Skill\n\n" + ("Lorem ipsum dolor sit amet. " * 2000)
        if large_doc
        else "# Test Skill\n\nThis is a small test document."
    )
    (skill_dir / "SKILL.md").write_text(skill_md)

    # A single reference document under references/.
    refs_dir = skill_dir / "references"
    refs_dir.mkdir()
    reference_md = (
        "# API Reference\n\n" + ("Function details here. " * 1000)
        if large_doc
        else "# API Reference\n\nSome API documentation."
    )
    (refs_dir / "api_reference.md").write_text(reference_md)

    return skill_dir


class TestChunkingDisabledByDefault:
    """Chunking must stay off unless explicitly requested."""

    def test_langchain_no_chunking_default(self, tmp_path):
        """LangChain packaging leaves even large docs unchunked by default."""
        skill_dir = create_test_skill(tmp_path, large_doc=True)

        adaptor = get_adaptor("langchain")
        package_path = adaptor.package(skill_dir, tmp_path)

        with open(package_path) as fh:
            data = json.load(fh)

        # Exactly two documents: SKILL.md plus the single reference file.
        assert len(data) == 2, f"Expected 2 docs, got {len(data)}"

        # And no chunking metadata anywhere.
        for doc in data:
            meta = doc["metadata"]
            assert "is_chunked" not in meta
            assert "chunk_index" not in meta


class TestChunkingEnabled:
    """Chunking behavior when explicitly enabled."""

    def test_langchain_chunking_enabled(self, tmp_path):
        """Large documents are split into multiple chunks with full metadata."""
        skill_dir = create_test_skill(tmp_path, large_doc=True)

        adaptor = get_adaptor("langchain")
        package_path = adaptor.package(
            skill_dir, tmp_path, enable_chunking=True, chunk_max_tokens=512
        )

        with open(package_path) as fh:
            data = json.load(fh)

        # More entries than the 2 source documents means splitting happened.
        assert len(data) > 2, f"Large doc should be chunked, got {len(data)} docs"

        chunked_docs = [doc for doc in data if doc["metadata"].get("is_chunked")]
        assert len(chunked_docs) > 0, "Should have chunked documents"

        # Every chunked doc carries the chunk-tracking fields.
        for doc in chunked_docs:
            meta = doc["metadata"]
            assert "chunk_index" in meta
            assert "total_chunks" in meta
            assert "chunk_id" in meta

    def test_chunking_preserves_small_docs(self, tmp_path):
        """Documents under the token budget pass through unchunked."""
        skill_dir = create_test_skill(tmp_path, large_doc=False)

        adaptor = get_adaptor("langchain")
        package_path = adaptor.package(
            skill_dir, tmp_path, enable_chunking=True, chunk_max_tokens=512
        )

        with open(package_path) as fh:
            data = json.load(fh)

        assert len(data) == 2, "Small docs should not be chunked"
        for doc in data:
            assert "is_chunked" not in doc["metadata"]


class TestCodeBlockPreservation:
    """Code blocks must survive chunking intact."""

    def test_preserve_code_blocks(self, tmp_path):
        """A fenced code block is never split across chunks."""
        skill_dir = tmp_path / "test_skill"
        skill_dir.mkdir()

        # Document with a fenced python block, padded large enough to
        # force chunking.
        content = """# Test

Some intro text that needs to be here for context.

```python
def example_function():
    # This code block should not be split
    x = 1
    y = 2
    z = 3
    return x + y + z
```

More content after code block.
""" + ("Lorem ipsum dolor sit amet. " * 1000)

        (skill_dir / "SKILL.md").write_text(content)
        (skill_dir / "references").mkdir()  # required by the packager

        adaptor = get_adaptor("langchain")
        package_path = adaptor.package(
            skill_dir,
            tmp_path,
            enable_chunking=True,
            chunk_max_tokens=200,  # small chunks so splitting is forced
            preserve_code_blocks=True,
        )

        with open(package_path) as fh:
            data = json.load(fh)

        code_chunks = [doc for doc in data if "```python" in doc["page_content"]]
        assert len(code_chunks) >= 1, "Code block should be preserved"

        # Any chunk holding the opening fence must also hold the closing one.
        for chunk in code_chunks:
            content = chunk["page_content"]
            if "```python" in content:
                assert content.count("```") >= 2, "Code block should be complete"


class TestAutoChunkingForRAGPlatforms:
    """RAG targets should turn chunking on automatically."""

    @pytest.mark.parametrize(
        "platform",
        [
            "langchain",
            # Add others after they're updated:
            # 'llama-index', 'haystack', 'weaviate', 'chroma', 'faiss', 'qdrant'
        ],
    )
    def test_rag_platforms_auto_chunk(self, platform, tmp_path):
        """Packaging for a RAG platform chunks even when chunking is disabled."""
        skill_dir = create_test_skill(tmp_path, large_doc=True)

        from skill_seekers.cli.package_skill import package_skill

        # enable_chunking=False is expected to be overridden for RAG targets.
        success, package_path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target=platform,
            enable_chunking=False,
        )

        assert success, f"Packaging failed for {platform}"
        assert package_path.exists(), f"Package not created for {platform}"

        with open(package_path) as fh:
            data = json.load(fh)

        # Chunking happened if there are more entries than source docs.
        if isinstance(data, list):
            assert len(data) > 2, f"{platform}: Should auto-chunk large docs"
        elif isinstance(data, dict) and "documents" in data:
            assert len(data["documents"]) > 2, f"{platform}: Should auto-chunk large docs"


class TestBaseAdaptorChunkingHelper:
    """Unit tests for the base adaptor's _maybe_chunk_content helper."""

    def test_maybe_chunk_content_disabled(self):
        """With chunking off the helper echoes back one (content, metadata) pair."""
        from skill_seekers.cli.adaptors.langchain import LangChainAdaptor

        adaptor = LangChainAdaptor()
        content = "Test content " * 1000  # deliberately oversized
        metadata = {"source": "test"}

        chunks = adaptor._maybe_chunk_content(content, metadata, enable_chunking=False)

        assert len(chunks) == 1
        assert chunks[0][0] == content
        assert chunks[0][1] == metadata

    def test_maybe_chunk_content_small_doc(self):
        """Content below the token budget stays a single chunk even when enabled."""
        from skill_seekers.cli.adaptors.langchain import LangChainAdaptor

        adaptor = LangChainAdaptor()
        content = "Small test content"  # well under 512 tokens
        metadata = {"source": "test"}

        chunks = adaptor._maybe_chunk_content(
            content, metadata, enable_chunking=True, chunk_max_tokens=512
        )

        assert len(chunks) == 1

    def test_maybe_chunk_content_large_doc(self):
        """Oversized content is split and each chunk carries merged metadata."""
        from skill_seekers.cli.adaptors.langchain import LangChainAdaptor

        adaptor = LangChainAdaptor()
        content = "Lorem ipsum dolor sit amet. " * 2000  # >512 tokens
        metadata = {"source": "test", "file": "test.md"}

        chunks = adaptor._maybe_chunk_content(
            content,
            metadata,
            enable_chunking=True,
            chunk_max_tokens=512,
            preserve_code_blocks=True,
            source_file="test.md",
        )

        assert len(chunks) > 1, f"Large doc should be chunked, got {len(chunks)} chunks"

        for chunk_text, chunk_meta in chunks:
            assert isinstance(chunk_text, str)
            assert isinstance(chunk_meta, dict)
            assert chunk_meta["is_chunked"]
            assert "chunk_index" in chunk_meta
            assert "chunk_id" in chunk_meta
            # Caller-supplied metadata must survive the merge.
            assert chunk_meta["source"] == "test"
            assert chunk_meta["file"] == "test.md"


class TestChunkingCLIIntegration:
    """End-to-end chunking via package_skill CLI parameters."""

    def test_chunk_flag(self, tmp_path):
        """--chunk-for-rag turns chunking on."""
        from skill_seekers.cli.package_skill import package_skill

        skill_dir = create_test_skill(tmp_path, large_doc=True)

        success, package_path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,  # --chunk-for-rag flag
            chunk_max_tokens=512,
            preserve_code_blocks=True,
        )

        assert success
        assert package_path.exists()

        with open(package_path) as fh:
            data = json.load(fh)

        assert len(data) > 2

    def test_chunk_tokens_parameter(self, tmp_path):
        """--chunk-tokens controls how finely documents are split."""
        from skill_seekers.cli.package_skill import package_skill

        skill_dir = create_test_skill(tmp_path, large_doc=True)

        # First pass: small chunk budget.
        success, package_path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,
            chunk_max_tokens=256,  # Small chunks
            preserve_code_blocks=True,
        )
        assert success
        with open(package_path) as fh:
            data_small = json.load(fh)

        # Second pass: large chunk budget.
        success, package_path2 = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,
            chunk_max_tokens=1024,  # Large chunks
            preserve_code_blocks=True,
        )
        assert success
        with open(package_path2) as fh:
            data_large = json.load(fh)

        assert len(data_small) > len(data_large), (
            f"Small chunks ({len(data_small)}) should be more than large chunks ({len(data_large)})"
        )

    def test_chunk_overlap_tokens_parameter(self, tmp_path):
        """--chunk-overlap-tokens is forwarded to RAGChunker."""
        from skill_seekers.cli.package_skill import package_skill

        skill_dir = create_test_skill(tmp_path, large_doc=True)

        # Default overlap (50 tokens).
        success, package_path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,
            chunk_max_tokens=256,
            chunk_overlap_tokens=50,
        )
        assert success
        assert package_path.exists()
        with open(package_path) as fh:
            data_default = json.load(fh)

        # Enlarged overlap (128 tokens).
        success2, package_path2 = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,
            chunk_max_tokens=256,
            chunk_overlap_tokens=128,
        )
        assert success2
        assert package_path2.exists()
        with open(package_path2) as fh:
            data_large_overlap = json.load(fh)

        # More overlap means at least as many chunks.
        assert len(data_large_overlap) >= len(data_default), (
            f"Large overlap ({len(data_large_overlap)}) should produce >= chunks than default ({len(data_default)})"
        )

    def test_chunk_overlap_scales_with_chunk_size(self, tmp_path):
        """Default overlap auto-scales when chunk_tokens is raised above default."""
        from skill_seekers.cli.adaptors.base import (
            DEFAULT_CHUNK_TOKENS,
            DEFAULT_CHUNK_OVERLAP_TOKENS,
        )

        adaptor = get_adaptor("langchain")
        skill_dir = create_test_skill(tmp_path, large_doc=True)
        adaptor._build_skill_metadata(skill_dir)
        content = (skill_dir / "SKILL.md").read_text()

        # Default chunk size (512) with default overlap (50): overlap stays 50.
        chunks_default = adaptor._maybe_chunk_content(
            content,
            {"source": "test"},
            enable_chunking=True,
            chunk_max_tokens=DEFAULT_CHUNK_TOKENS,
            chunk_overlap_tokens=DEFAULT_CHUNK_OVERLAP_TOKENS,
        )

        # Chunk size 1024 with default overlap: expected to auto-scale to
        # max(50, 1024 // 10) = 102.
        chunks_large = adaptor._maybe_chunk_content(
            content,
            {"source": "test"},
            enable_chunking=True,
            chunk_max_tokens=1024,
            chunk_overlap_tokens=DEFAULT_CHUNK_OVERLAP_TOKENS,
        )

        # Both configurations must yield valid chunk lists.
        assert len(chunks_default) > 1
        assert len(chunks_large) >= 1

    def test_preserve_code_blocks_flag(self, tmp_path):
        """--no-preserve-code-blocks is accepted without error."""
        from skill_seekers.cli.package_skill import package_skill

        skill_dir = create_test_skill(tmp_path, large_doc=True)

        success, package_path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,
            chunk_max_tokens=256,
            preserve_code_blocks=False,
        )

        assert success
        assert package_path.exists()


if __name__ == "__main__":
    pytest.main([__file__, "-v"])