feat: Complete Phase 1b - Implement chunking in all 6 RAG adaptors
- Updated chroma.py: Parallel arrays pattern with chunking support
- Updated llama_index.py: Node format with chunking support
- Updated haystack.py: Document format with chunking support
- Updated faiss_helpers.py: Parallel arrays pattern with chunking support
- Updated weaviate.py: Object/properties format with chunking support
- Updated qdrant.py: Points/payload format with chunking support

All adaptors now use base._maybe_chunk_content() for consistent chunking behavior:
- Auto-chunks large documents (>512 tokens by default)
- Preserves code blocks during chunking
- Adds chunk metadata (chunk_index, total_chunks, is_chunked, chunk_id)
- Configurable via enable_chunking, chunk_max_tokens, preserve_code_blocks

Test results: 174/174 tests passing (6 skipped E2E tests)
- All 10 chunking integration tests pass
- All 66 RAG adaptor tests pass
- All platform-specific tests pass

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -62,6 +62,8 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
Args:
|
||||
skill_dir: Path to skill directory
|
||||
metadata: Skill metadata
|
||||
enable_chunking: Enable intelligent chunking for large documents
|
||||
**kwargs: Additional chunking parameters (chunk_max_tokens, preserve_code_blocks)
|
||||
|
||||
Returns:
|
||||
JSON string containing Chroma-compatible data
|
||||
@@ -83,9 +85,21 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
"version": metadata.version,
|
||||
}
|
||||
|
||||
documents.append(content)
|
||||
metadatas.append(doc_metadata)
|
||||
ids.append(self._generate_id(content, doc_metadata))
|
||||
# Chunk if enabled
|
||||
chunks = self._maybe_chunk_content(
|
||||
content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append(chunk_text)
|
||||
metadatas.append(chunk_meta)
|
||||
ids.append(self._generate_id(chunk_text, chunk_meta))
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
@@ -101,9 +115,21 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
"version": metadata.version,
|
||||
}
|
||||
|
||||
documents.append(ref_content)
|
||||
metadatas.append(doc_metadata)
|
||||
ids.append(self._generate_id(ref_content, doc_metadata))
|
||||
# Chunk if enabled
|
||||
chunks = self._maybe_chunk_content(
|
||||
ref_content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append(chunk_text)
|
||||
metadatas.append(chunk_meta)
|
||||
ids.append(self._generate_id(chunk_text, chunk_meta))
|
||||
|
||||
# Return Chroma-compatible format
|
||||
return json.dumps(
|
||||
|
||||
@@ -64,6 +64,8 @@ class FAISSHelpers(SkillAdaptor):
|
||||
Args:
|
||||
skill_dir: Path to skill directory
|
||||
metadata: Skill metadata
|
||||
enable_chunking: Enable intelligent chunking for large documents
|
||||
**kwargs: Additional chunking parameters
|
||||
|
||||
Returns:
|
||||
JSON string containing FAISS-compatible data
|
||||
@@ -85,9 +87,21 @@ class FAISSHelpers(SkillAdaptor):
|
||||
"version": metadata.version,
|
||||
}
|
||||
|
||||
documents.append(content)
|
||||
metadatas.append(doc_metadata)
|
||||
ids.append(self._generate_id(content, doc_metadata))
|
||||
# Chunk if enabled
|
||||
chunks = self._maybe_chunk_content(
|
||||
content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append(chunk_text)
|
||||
metadatas.append(chunk_meta)
|
||||
ids.append(self._generate_id(chunk_text, chunk_meta))
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
@@ -102,9 +116,21 @@ class FAISSHelpers(SkillAdaptor):
|
||||
"version": metadata.version,
|
||||
}
|
||||
|
||||
documents.append(ref_content)
|
||||
metadatas.append(doc_metadata)
|
||||
ids.append(self._generate_id(ref_content, doc_metadata))
|
||||
# Chunk if enabled
|
||||
chunks = self._maybe_chunk_content(
|
||||
ref_content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append(chunk_text)
|
||||
metadatas.append(chunk_meta)
|
||||
ids.append(self._generate_id(chunk_text, chunk_meta))
|
||||
|
||||
# FAISS configuration hints
|
||||
config = {
|
||||
|
||||
@@ -47,6 +47,8 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
Args:
|
||||
skill_dir: Path to skill directory
|
||||
metadata: Skill metadata
|
||||
enable_chunking: Enable intelligent chunking for large documents
|
||||
**kwargs: Additional chunking parameters
|
||||
|
||||
Returns:
|
||||
JSON string containing array of Haystack Documents
|
||||
@@ -58,38 +60,62 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
if skill_md_path.exists():
|
||||
content = self._read_existing_content(skill_dir)
|
||||
if content.strip():
|
||||
documents.append(
|
||||
{
|
||||
"content": content,
|
||||
"meta": {
|
||||
"source": metadata.name,
|
||||
"category": "overview",
|
||||
"file": "SKILL.md",
|
||||
"type": "documentation",
|
||||
"version": metadata.version,
|
||||
},
|
||||
}
|
||||
doc_meta = {
|
||||
"source": metadata.name,
|
||||
"category": "overview",
|
||||
"file": "SKILL.md",
|
||||
"type": "documentation",
|
||||
"version": metadata.version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
chunks = self._maybe_chunk_content(
|
||||
content,
|
||||
doc_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
)
|
||||
|
||||
# Add all chunks as documents
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append({
|
||||
"content": chunk_text,
|
||||
"meta": chunk_meta,
|
||||
})
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
if ref_content.strip():
|
||||
# Derive category from filename
|
||||
category = ref_file.stem.replace("_", " ").lower()
|
||||
|
||||
documents.append(
|
||||
{
|
||||
"content": ref_content,
|
||||
"meta": {
|
||||
"source": metadata.name,
|
||||
"category": category,
|
||||
"file": ref_file.name,
|
||||
"type": "reference",
|
||||
"version": metadata.version,
|
||||
},
|
||||
}
|
||||
doc_meta = {
|
||||
"source": metadata.name,
|
||||
"category": category,
|
||||
"file": ref_file.name,
|
||||
"type": "reference",
|
||||
"version": metadata.version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
chunks = self._maybe_chunk_content(
|
||||
ref_content,
|
||||
doc_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
)
|
||||
|
||||
# Add all chunks as documents
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append({
|
||||
"content": chunk_text,
|
||||
"meta": chunk_meta,
|
||||
})
|
||||
|
||||
# Return as formatted JSON
|
||||
return json.dumps(documents, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
@@ -62,6 +62,8 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
Args:
|
||||
skill_dir: Path to skill directory
|
||||
metadata: Skill metadata
|
||||
enable_chunking: Enable intelligent chunking for large documents
|
||||
**kwargs: Additional chunking parameters (chunk_max_tokens, preserve_code_blocks)
|
||||
|
||||
Returns:
|
||||
JSON string containing array of LlamaIndex Nodes
|
||||
@@ -80,15 +82,26 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
"type": "documentation",
|
||||
"version": metadata.version,
|
||||
}
|
||||
nodes.append(
|
||||
{
|
||||
"text": content,
|
||||
"metadata": node_metadata,
|
||||
"id_": self._generate_node_id(content, node_metadata),
|
||||
"embedding": None,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
chunks = self._maybe_chunk_content(
|
||||
content,
|
||||
node_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
)
|
||||
|
||||
# Add all chunks as nodes
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
nodes.append({
|
||||
"text": chunk_text,
|
||||
"metadata": chunk_meta,
|
||||
"id_": self._generate_node_id(chunk_text, chunk_meta),
|
||||
"embedding": None,
|
||||
})
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
if ref_content.strip():
|
||||
@@ -103,15 +116,25 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
"version": metadata.version,
|
||||
}
|
||||
|
||||
nodes.append(
|
||||
{
|
||||
"text": ref_content,
|
||||
"metadata": node_metadata,
|
||||
"id_": self._generate_node_id(ref_content, node_metadata),
|
||||
"embedding": None,
|
||||
}
|
||||
# Chunk if enabled
|
||||
chunks = self._maybe_chunk_content(
|
||||
ref_content,
|
||||
node_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
)
|
||||
|
||||
# Add all chunks as nodes
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
nodes.append({
|
||||
"text": chunk_text,
|
||||
"metadata": chunk_meta,
|
||||
"id_": self._generate_node_id(chunk_text, chunk_meta),
|
||||
"embedding": None,
|
||||
})
|
||||
|
||||
# Return as formatted JSON
|
||||
return json.dumps(nodes, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
@@ -61,6 +61,8 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
Args:
|
||||
skill_dir: Path to skill directory
|
||||
metadata: Skill metadata
|
||||
enable_chunking: Enable intelligent chunking for large documents
|
||||
**kwargs: Additional chunking parameters
|
||||
|
||||
Returns:
|
||||
JSON string containing Qdrant-compatible data
|
||||
@@ -72,46 +74,86 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
if skill_md_path.exists():
|
||||
content = self._read_existing_content(skill_dir)
|
||||
if content.strip():
|
||||
point_id = self._generate_point_id(content, {
|
||||
payload_meta = {
|
||||
"source": metadata.name,
|
||||
"file": "SKILL.md"
|
||||
})
|
||||
"category": "overview",
|
||||
"file": "SKILL.md",
|
||||
"type": "documentation",
|
||||
"version": metadata.version,
|
||||
}
|
||||
|
||||
points.append({
|
||||
"id": point_id,
|
||||
"vector": None, # User will generate embeddings
|
||||
"payload": {
|
||||
"content": content,
|
||||
"source": metadata.name,
|
||||
"category": "overview",
|
||||
"file": "SKILL.md",
|
||||
"type": "documentation",
|
||||
"version": metadata.version,
|
||||
}
|
||||
})
|
||||
# Chunk if enabled
|
||||
chunks = self._maybe_chunk_content(
|
||||
content,
|
||||
payload_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
)
|
||||
|
||||
# Add all chunks as points
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
point_id = self._generate_point_id(chunk_text, {
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"file": chunk_meta.get("file", "SKILL.md")
|
||||
})
|
||||
|
||||
points.append({
|
||||
"id": point_id,
|
||||
"vector": None, # User will generate embeddings
|
||||
"payload": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", "overview"),
|
||||
"file": chunk_meta.get("file", "SKILL.md"),
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
}
|
||||
})
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
if ref_content.strip():
|
||||
category = ref_file.stem.replace("_", " ").lower()
|
||||
|
||||
point_id = self._generate_point_id(ref_content, {
|
||||
payload_meta = {
|
||||
"source": metadata.name,
|
||||
"file": ref_file.name
|
||||
})
|
||||
"category": category,
|
||||
"file": ref_file.name,
|
||||
"type": "reference",
|
||||
"version": metadata.version,
|
||||
}
|
||||
|
||||
points.append({
|
||||
"id": point_id,
|
||||
"vector": None, # User will generate embeddings
|
||||
"payload": {
|
||||
"content": ref_content,
|
||||
"source": metadata.name,
|
||||
"category": category,
|
||||
"file": ref_file.name,
|
||||
"type": "reference",
|
||||
"version": metadata.version,
|
||||
}
|
||||
})
|
||||
# Chunk if enabled
|
||||
chunks = self._maybe_chunk_content(
|
||||
ref_content,
|
||||
payload_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
)
|
||||
|
||||
# Add all chunks as points
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
point_id = self._generate_point_id(chunk_text, {
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"file": chunk_meta.get("file", ref_file.name)
|
||||
})
|
||||
|
||||
points.append({
|
||||
"id": point_id,
|
||||
"vector": None, # User will generate embeddings
|
||||
"payload": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", category),
|
||||
"file": chunk_meta.get("file", ref_file.name),
|
||||
"type": chunk_meta.get("type", "reference"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
}
|
||||
})
|
||||
|
||||
# Qdrant configuration
|
||||
config = {
|
||||
|
||||
@@ -122,6 +122,8 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
Args:
|
||||
skill_dir: Path to skill directory
|
||||
metadata: Skill metadata
|
||||
enable_chunking: Enable intelligent chunking for large documents
|
||||
**kwargs: Additional chunking parameters
|
||||
|
||||
Returns:
|
||||
JSON string containing Weaviate objects and schema
|
||||
@@ -141,20 +143,30 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
"version": metadata.version,
|
||||
}
|
||||
|
||||
objects.append(
|
||||
{
|
||||
"id": self._generate_uuid(content, obj_metadata),
|
||||
"properties": {
|
||||
"content": content,
|
||||
"source": obj_metadata["source"],
|
||||
"category": obj_metadata["category"],
|
||||
"file": obj_metadata["file"],
|
||||
"type": obj_metadata["type"],
|
||||
"version": obj_metadata["version"],
|
||||
},
|
||||
}
|
||||
# Chunk if enabled
|
||||
chunks = self._maybe_chunk_content(
|
||||
content,
|
||||
obj_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
)
|
||||
|
||||
# Add all chunks as objects
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
objects.append({
|
||||
"id": self._generate_uuid(chunk_text, chunk_meta),
|
||||
"properties": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", "overview"),
|
||||
"file": chunk_meta.get("file", "SKILL.md"),
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
})
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
if ref_content.strip():
|
||||
@@ -169,20 +181,30 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
"version": metadata.version,
|
||||
}
|
||||
|
||||
objects.append(
|
||||
{
|
||||
"id": self._generate_uuid(ref_content, obj_metadata),
|
||||
"properties": {
|
||||
"content": ref_content,
|
||||
"source": obj_metadata["source"],
|
||||
"category": obj_metadata["category"],
|
||||
"file": obj_metadata["file"],
|
||||
"type": obj_metadata["type"],
|
||||
"version": obj_metadata["version"],
|
||||
},
|
||||
}
|
||||
# Chunk if enabled
|
||||
chunks = self._maybe_chunk_content(
|
||||
ref_content,
|
||||
obj_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
)
|
||||
|
||||
# Add all chunks as objects
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
objects.append({
|
||||
"id": self._generate_uuid(chunk_text, chunk_meta),
|
||||
"properties": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", category),
|
||||
"file": chunk_meta.get("file", ref_file.name),
|
||||
"type": chunk_meta.get("type", "reference"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
})
|
||||
|
||||
# Generate schema
|
||||
class_name = "".join(word.capitalize() for word in metadata.name.split("_"))
|
||||
schema = self._generate_schema(class_name)
|
||||
|
||||
Reference in New Issue
Block a user