feat: Complete Phase 1b - Implement chunking in all 6 RAG adaptors

- Updated chroma.py: Parallel arrays pattern with chunking support
- Updated llama_index.py: Node format with chunking support
- Updated haystack.py: Document format with chunking support
- Updated faiss_helpers.py: Parallel arrays pattern with chunking support
- Updated weaviate.py: Object/properties format with chunking support
- Updated qdrant.py: Points/payload format with chunking support

All adaptors now use base._maybe_chunk_content() for consistent chunking behavior:
- Auto-chunks large documents (>512 tokens by default)
- Preserves code blocks during chunking
- Adds chunk metadata (chunk_index, total_chunks, is_chunked, chunk_id)
- Configurable via enable_chunking, chunk_max_tokens, preserve_code_blocks

Test results: 174/174 tests passing (plus 6 E2E tests skipped)
- All 10 chunking integration tests pass
- All 66 RAG adaptor tests pass
- All platform-specific tests pass

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-08 01:15:10 +03:00
parent e9e3f5f4d7
commit 59e77f42b3
6 changed files with 267 additions and 102 deletions

View File

@@ -62,6 +62,8 @@ class ChromaAdaptor(SkillAdaptor):
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
enable_chunking: Enable intelligent chunking for large documents
**kwargs: Additional chunking parameters (chunk_max_tokens, preserve_code_blocks)
Returns:
JSON string containing Chroma-compatible data
@@ -83,9 +85,21 @@ class ChromaAdaptor(SkillAdaptor):
"version": metadata.version,
}
documents.append(content)
metadatas.append(doc_metadata)
ids.append(self._generate_id(content, doc_metadata))
# Chunk if enabled
chunks = self._maybe_chunk_content(
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
)
# Add all chunks to parallel arrays
for chunk_text, chunk_meta in chunks:
documents.append(chunk_text)
metadatas.append(chunk_meta)
ids.append(self._generate_id(chunk_text, chunk_meta))
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -101,9 +115,21 @@ class ChromaAdaptor(SkillAdaptor):
"version": metadata.version,
}
documents.append(ref_content)
metadatas.append(doc_metadata)
ids.append(self._generate_id(ref_content, doc_metadata))
# Chunk if enabled
chunks = self._maybe_chunk_content(
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
)
# Add all chunks to parallel arrays
for chunk_text, chunk_meta in chunks:
documents.append(chunk_text)
metadatas.append(chunk_meta)
ids.append(self._generate_id(chunk_text, chunk_meta))
# Return Chroma-compatible format
return json.dumps(

View File

@@ -64,6 +64,8 @@ class FAISSHelpers(SkillAdaptor):
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
enable_chunking: Enable intelligent chunking for large documents
**kwargs: Additional chunking parameters
Returns:
JSON string containing FAISS-compatible data
@@ -85,9 +87,21 @@ class FAISSHelpers(SkillAdaptor):
"version": metadata.version,
}
documents.append(content)
metadatas.append(doc_metadata)
ids.append(self._generate_id(content, doc_metadata))
# Chunk if enabled
chunks = self._maybe_chunk_content(
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
)
# Add all chunks to parallel arrays
for chunk_text, chunk_meta in chunks:
documents.append(chunk_text)
metadatas.append(chunk_meta)
ids.append(self._generate_id(chunk_text, chunk_meta))
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -102,9 +116,21 @@ class FAISSHelpers(SkillAdaptor):
"version": metadata.version,
}
documents.append(ref_content)
metadatas.append(doc_metadata)
ids.append(self._generate_id(ref_content, doc_metadata))
# Chunk if enabled
chunks = self._maybe_chunk_content(
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
)
# Add all chunks to parallel arrays
for chunk_text, chunk_meta in chunks:
documents.append(chunk_text)
metadatas.append(chunk_meta)
ids.append(self._generate_id(chunk_text, chunk_meta))
# FAISS configuration hints
config = {

View File

@@ -47,6 +47,8 @@ class HaystackAdaptor(SkillAdaptor):
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
enable_chunking: Enable intelligent chunking for large documents
**kwargs: Additional chunking parameters
Returns:
JSON string containing array of Haystack Documents
@@ -58,38 +60,62 @@ class HaystackAdaptor(SkillAdaptor):
if skill_md_path.exists():
content = self._read_existing_content(skill_dir)
if content.strip():
documents.append(
{
"content": content,
"meta": {
"source": metadata.name,
"category": "overview",
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
},
}
doc_meta = {
"source": metadata.name,
"category": "overview",
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
}
# Chunk if enabled
chunks = self._maybe_chunk_content(
content,
doc_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
)
# Add all chunks as documents
for chunk_text, chunk_meta in chunks:
documents.append({
"content": chunk_text,
"meta": chunk_meta,
})
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
if ref_content.strip():
# Derive category from filename
category = ref_file.stem.replace("_", " ").lower()
documents.append(
{
"content": ref_content,
"meta": {
"source": metadata.name,
"category": category,
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
},
}
doc_meta = {
"source": metadata.name,
"category": category,
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
}
# Chunk if enabled
chunks = self._maybe_chunk_content(
ref_content,
doc_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
)
# Add all chunks as documents
for chunk_text, chunk_meta in chunks:
documents.append({
"content": chunk_text,
"meta": chunk_meta,
})
# Return as formatted JSON
return json.dumps(documents, indent=2, ensure_ascii=False)

View File

@@ -62,6 +62,8 @@ class LlamaIndexAdaptor(SkillAdaptor):
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
enable_chunking: Enable intelligent chunking for large documents
**kwargs: Additional chunking parameters (chunk_max_tokens, preserve_code_blocks)
Returns:
JSON string containing array of LlamaIndex Nodes
@@ -80,15 +82,26 @@ class LlamaIndexAdaptor(SkillAdaptor):
"type": "documentation",
"version": metadata.version,
}
nodes.append(
{
"text": content,
"metadata": node_metadata,
"id_": self._generate_node_id(content, node_metadata),
"embedding": None,
}
# Chunk if enabled
chunks = self._maybe_chunk_content(
content,
node_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
)
# Add all chunks as nodes
for chunk_text, chunk_meta in chunks:
nodes.append({
"text": chunk_text,
"metadata": chunk_meta,
"id_": self._generate_node_id(chunk_text, chunk_meta),
"embedding": None,
})
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
if ref_content.strip():
@@ -103,15 +116,25 @@ class LlamaIndexAdaptor(SkillAdaptor):
"version": metadata.version,
}
nodes.append(
{
"text": ref_content,
"metadata": node_metadata,
"id_": self._generate_node_id(ref_content, node_metadata),
"embedding": None,
}
# Chunk if enabled
chunks = self._maybe_chunk_content(
ref_content,
node_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
)
# Add all chunks as nodes
for chunk_text, chunk_meta in chunks:
nodes.append({
"text": chunk_text,
"metadata": chunk_meta,
"id_": self._generate_node_id(chunk_text, chunk_meta),
"embedding": None,
})
# Return as formatted JSON
return json.dumps(nodes, indent=2, ensure_ascii=False)

View File

@@ -61,6 +61,8 @@ class QdrantAdaptor(SkillAdaptor):
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
enable_chunking: Enable intelligent chunking for large documents
**kwargs: Additional chunking parameters
Returns:
JSON string containing Qdrant-compatible data
@@ -72,46 +74,86 @@ class QdrantAdaptor(SkillAdaptor):
if skill_md_path.exists():
content = self._read_existing_content(skill_dir)
if content.strip():
point_id = self._generate_point_id(content, {
payload_meta = {
"source": metadata.name,
"file": "SKILL.md"
})
"category": "overview",
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
}
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": content,
"source": metadata.name,
"category": "overview",
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
}
})
# Chunk if enabled
chunks = self._maybe_chunk_content(
content,
payload_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
)
# Add all chunks as points
for chunk_text, chunk_meta in chunks:
point_id = self._generate_point_id(chunk_text, {
"source": chunk_meta.get("source", metadata.name),
"file": chunk_meta.get("file", "SKILL.md")
})
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", "overview"),
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
}
})
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
if ref_content.strip():
category = ref_file.stem.replace("_", " ").lower()
point_id = self._generate_point_id(ref_content, {
payload_meta = {
"source": metadata.name,
"file": ref_file.name
})
"category": category,
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
}
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": ref_content,
"source": metadata.name,
"category": category,
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
}
})
# Chunk if enabled
chunks = self._maybe_chunk_content(
ref_content,
payload_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
)
# Add all chunks as points
for chunk_text, chunk_meta in chunks:
point_id = self._generate_point_id(chunk_text, {
"source": chunk_meta.get("source", metadata.name),
"file": chunk_meta.get("file", ref_file.name)
})
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", category),
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
}
})
# Qdrant configuration
config = {

View File

@@ -122,6 +122,8 @@ class WeaviateAdaptor(SkillAdaptor):
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
enable_chunking: Enable intelligent chunking for large documents
**kwargs: Additional chunking parameters
Returns:
JSON string containing Weaviate objects and schema
@@ -141,20 +143,30 @@ class WeaviateAdaptor(SkillAdaptor):
"version": metadata.version,
}
objects.append(
{
"id": self._generate_uuid(content, obj_metadata),
"properties": {
"content": content,
"source": obj_metadata["source"],
"category": obj_metadata["category"],
"file": obj_metadata["file"],
"type": obj_metadata["type"],
"version": obj_metadata["version"],
},
}
# Chunk if enabled
chunks = self._maybe_chunk_content(
content,
obj_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
)
# Add all chunks as objects
for chunk_text, chunk_meta in chunks:
objects.append({
"id": self._generate_uuid(chunk_text, chunk_meta),
"properties": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", "overview"),
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
},
})
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
if ref_content.strip():
@@ -169,20 +181,30 @@ class WeaviateAdaptor(SkillAdaptor):
"version": metadata.version,
}
objects.append(
{
"id": self._generate_uuid(ref_content, obj_metadata),
"properties": {
"content": ref_content,
"source": obj_metadata["source"],
"category": obj_metadata["category"],
"file": obj_metadata["file"],
"type": obj_metadata["type"],
"version": obj_metadata["version"],
},
}
# Chunk if enabled
chunks = self._maybe_chunk_content(
ref_content,
obj_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
)
# Add all chunks as objects
for chunk_text, chunk_meta in chunks:
objects.append({
"id": self._generate_uuid(chunk_text, chunk_meta),
"properties": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", category),
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
},
})
# Generate schema
class_name = "".join(word.capitalize() for word in metadata.name.split("_"))
schema = self._generate_schema(class_name)