feat: Complete Phase 1b - Implement chunking in all 6 RAG adaptors

- Updated chroma.py: Parallel arrays pattern with chunking support
- Updated llama_index.py: Node format with chunking support
- Updated haystack.py: Document format with chunking support
- Updated faiss_helpers.py: Parallel arrays pattern with chunking support
- Updated weaviate.py: Object/properties format with chunking support
- Updated qdrant.py: Points/payload format with chunking support

All adaptors now use base._maybe_chunk_content() for consistent chunking behavior:
- Auto-chunks large documents (>512 tokens by default)
- Preserves code blocks during chunking
- Adds chunk metadata (chunk_index, total_chunks, is_chunked, chunk_id)
- Configurable via enable_chunking, chunk_max_tokens, preserve_code_blocks

Test results: 174/174 tests passing (plus 6 E2E tests skipped)
- All 10 chunking integration tests pass
- All 66 RAG adaptor tests pass
- All platform-specific tests pass

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-08 01:15:10 +03:00
parent e9e3f5f4d7
commit 59e77f42b3
6 changed files with 267 additions and 102 deletions

View File

@@ -62,6 +62,8 @@ class ChromaAdaptor(SkillAdaptor):
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
enable_chunking: Enable intelligent chunking for large documents
**kwargs: Additional chunking parameters (chunk_max_tokens, preserve_code_blocks)
Returns:
JSON string containing Chroma-compatible data
@@ -83,9 +85,21 @@ class ChromaAdaptor(SkillAdaptor):
"version": metadata.version,
}
documents.append(content)
metadatas.append(doc_metadata)
ids.append(self._generate_id(content, doc_metadata))
# Chunk if enabled
chunks = self._maybe_chunk_content(
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
)
# Add all chunks to parallel arrays
for chunk_text, chunk_meta in chunks:
documents.append(chunk_text)
metadatas.append(chunk_meta)
ids.append(self._generate_id(chunk_text, chunk_meta))
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -101,9 +115,21 @@ class ChromaAdaptor(SkillAdaptor):
"version": metadata.version,
}
documents.append(ref_content)
metadatas.append(doc_metadata)
ids.append(self._generate_id(ref_content, doc_metadata))
# Chunk if enabled
chunks = self._maybe_chunk_content(
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
)
# Add all chunks to parallel arrays
for chunk_text, chunk_meta in chunks:
documents.append(chunk_text)
metadatas.append(chunk_meta)
ids.append(self._generate_id(chunk_text, chunk_meta))
# Return Chroma-compatible format
return json.dumps(

View File

@@ -64,6 +64,8 @@ class FAISSHelpers(SkillAdaptor):
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
enable_chunking: Enable intelligent chunking for large documents
**kwargs: Additional chunking parameters
Returns:
JSON string containing FAISS-compatible data
@@ -85,9 +87,21 @@ class FAISSHelpers(SkillAdaptor):
"version": metadata.version,
}
documents.append(content)
metadatas.append(doc_metadata)
ids.append(self._generate_id(content, doc_metadata))
# Chunk if enabled
chunks = self._maybe_chunk_content(
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
)
# Add all chunks to parallel arrays
for chunk_text, chunk_meta in chunks:
documents.append(chunk_text)
metadatas.append(chunk_meta)
ids.append(self._generate_id(chunk_text, chunk_meta))
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -102,9 +116,21 @@ class FAISSHelpers(SkillAdaptor):
"version": metadata.version,
}
documents.append(ref_content)
metadatas.append(doc_metadata)
ids.append(self._generate_id(ref_content, doc_metadata))
# Chunk if enabled
chunks = self._maybe_chunk_content(
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
)
# Add all chunks to parallel arrays
for chunk_text, chunk_meta in chunks:
documents.append(chunk_text)
metadatas.append(chunk_meta)
ids.append(self._generate_id(chunk_text, chunk_meta))
# FAISS configuration hints
config = {

View File

@@ -47,6 +47,8 @@ class HaystackAdaptor(SkillAdaptor):
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
enable_chunking: Enable intelligent chunking for large documents
**kwargs: Additional chunking parameters
Returns:
JSON string containing array of Haystack Documents
@@ -58,38 +60,62 @@ class HaystackAdaptor(SkillAdaptor):
if skill_md_path.exists():
content = self._read_existing_content(skill_dir)
if content.strip():
documents.append(
{
"content": content,
"meta": {
"source": metadata.name,
"category": "overview",
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
},
}
doc_meta = {
"source": metadata.name,
"category": "overview",
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
}
# Chunk if enabled
chunks = self._maybe_chunk_content(
content,
doc_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
)
# Add all chunks as documents
for chunk_text, chunk_meta in chunks:
documents.append({
"content": chunk_text,
"meta": chunk_meta,
})
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
if ref_content.strip():
# Derive category from filename
category = ref_file.stem.replace("_", " ").lower()
documents.append(
{
"content": ref_content,
"meta": {
"source": metadata.name,
"category": category,
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
},
}
doc_meta = {
"source": metadata.name,
"category": category,
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
}
# Chunk if enabled
chunks = self._maybe_chunk_content(
ref_content,
doc_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
)
# Add all chunks as documents
for chunk_text, chunk_meta in chunks:
documents.append({
"content": chunk_text,
"meta": chunk_meta,
})
# Return as formatted JSON
return json.dumps(documents, indent=2, ensure_ascii=False)

View File

@@ -62,6 +62,8 @@ class LlamaIndexAdaptor(SkillAdaptor):
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
enable_chunking: Enable intelligent chunking for large documents
**kwargs: Additional chunking parameters (chunk_max_tokens, preserve_code_blocks)
Returns:
JSON string containing array of LlamaIndex Nodes
@@ -80,15 +82,26 @@ class LlamaIndexAdaptor(SkillAdaptor):
"type": "documentation",
"version": metadata.version,
}
nodes.append(
{
"text": content,
"metadata": node_metadata,
"id_": self._generate_node_id(content, node_metadata),
"embedding": None,
}
# Chunk if enabled
chunks = self._maybe_chunk_content(
content,
node_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
)
# Add all chunks as nodes
for chunk_text, chunk_meta in chunks:
nodes.append({
"text": chunk_text,
"metadata": chunk_meta,
"id_": self._generate_node_id(chunk_text, chunk_meta),
"embedding": None,
})
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
if ref_content.strip():
@@ -103,15 +116,25 @@ class LlamaIndexAdaptor(SkillAdaptor):
"version": metadata.version,
}
nodes.append(
{
"text": ref_content,
"metadata": node_metadata,
"id_": self._generate_node_id(ref_content, node_metadata),
"embedding": None,
}
# Chunk if enabled
chunks = self._maybe_chunk_content(
ref_content,
node_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
)
# Add all chunks as nodes
for chunk_text, chunk_meta in chunks:
nodes.append({
"text": chunk_text,
"metadata": chunk_meta,
"id_": self._generate_node_id(chunk_text, chunk_meta),
"embedding": None,
})
# Return as formatted JSON
return json.dumps(nodes, indent=2, ensure_ascii=False)

View File

@@ -61,6 +61,8 @@ class QdrantAdaptor(SkillAdaptor):
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
enable_chunking: Enable intelligent chunking for large documents
**kwargs: Additional chunking parameters
Returns:
JSON string containing Qdrant-compatible data
@@ -72,46 +74,86 @@ class QdrantAdaptor(SkillAdaptor):
if skill_md_path.exists():
content = self._read_existing_content(skill_dir)
if content.strip():
point_id = self._generate_point_id(content, {
payload_meta = {
"source": metadata.name,
"file": "SKILL.md"
})
"category": "overview",
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
}
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": content,
"source": metadata.name,
"category": "overview",
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
}
})
# Chunk if enabled
chunks = self._maybe_chunk_content(
content,
payload_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
)
# Add all chunks as points
for chunk_text, chunk_meta in chunks:
point_id = self._generate_point_id(chunk_text, {
"source": chunk_meta.get("source", metadata.name),
"file": chunk_meta.get("file", "SKILL.md")
})
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", "overview"),
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
}
})
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
if ref_content.strip():
category = ref_file.stem.replace("_", " ").lower()
point_id = self._generate_point_id(ref_content, {
payload_meta = {
"source": metadata.name,
"file": ref_file.name
})
"category": category,
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
}
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": ref_content,
"source": metadata.name,
"category": category,
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
}
})
# Chunk if enabled
chunks = self._maybe_chunk_content(
ref_content,
payload_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
)
# Add all chunks as points
for chunk_text, chunk_meta in chunks:
point_id = self._generate_point_id(chunk_text, {
"source": chunk_meta.get("source", metadata.name),
"file": chunk_meta.get("file", ref_file.name)
})
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", category),
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
}
})
# Qdrant configuration
config = {

View File

@@ -122,6 +122,8 @@ class WeaviateAdaptor(SkillAdaptor):
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
enable_chunking: Enable intelligent chunking for large documents
**kwargs: Additional chunking parameters
Returns:
JSON string containing Weaviate objects and schema
@@ -141,20 +143,30 @@ class WeaviateAdaptor(SkillAdaptor):
"version": metadata.version,
}
objects.append(
{
"id": self._generate_uuid(content, obj_metadata),
"properties": {
"content": content,
"source": obj_metadata["source"],
"category": obj_metadata["category"],
"file": obj_metadata["file"],
"type": obj_metadata["type"],
"version": obj_metadata["version"],
},
}
# Chunk if enabled
chunks = self._maybe_chunk_content(
content,
obj_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
)
# Add all chunks as objects
for chunk_text, chunk_meta in chunks:
objects.append({
"id": self._generate_uuid(chunk_text, chunk_meta),
"properties": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", "overview"),
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
},
})
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
if ref_content.strip():
@@ -169,20 +181,30 @@ class WeaviateAdaptor(SkillAdaptor):
"version": metadata.version,
}
objects.append(
{
"id": self._generate_uuid(ref_content, obj_metadata),
"properties": {
"content": ref_content,
"source": obj_metadata["source"],
"category": obj_metadata["category"],
"file": obj_metadata["file"],
"type": obj_metadata["type"],
"version": obj_metadata["version"],
},
}
# Chunk if enabled
chunks = self._maybe_chunk_content(
ref_content,
obj_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
)
# Add all chunks as objects
for chunk_text, chunk_meta in chunks:
objects.append({
"id": self._generate_uuid(chunk_text, chunk_meta),
"properties": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", category),
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
},
})
# Generate schema
class_name = "".join(word.capitalize() for word in metadata.name.split("_"))
schema = self._generate_schema(class_name)