feat: Complete Phase 1b - Implement chunking in all 6 RAG adaptors
- Updated chroma.py: Parallel arrays pattern with chunking support
- Updated llama_index.py: Node format with chunking support
- Updated haystack.py: Document format with chunking support
- Updated faiss_helpers.py: Parallel arrays pattern with chunking support
- Updated weaviate.py: Object/properties format with chunking support
- Updated qdrant.py: Points/payload format with chunking support

All adaptors now use base._maybe_chunk_content() for consistent chunking behavior:
- Auto-chunks large documents (>512 tokens by default)
- Preserves code blocks during chunking
- Adds chunk metadata (chunk_index, total_chunks, is_chunked, chunk_id)
- Configurable via enable_chunking, chunk_max_tokens, preserve_code_blocks

Test results: 174/174 tests passing (6 skipped E2E tests)
- All 10 chunking integration tests pass
- All 66 RAG adaptor tests pass
- All platform-specific tests pass

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -62,6 +62,8 @@ class ChromaAdaptor(SkillAdaptor):
|
|||||||
Args:
|
Args:
|
||||||
skill_dir: Path to skill directory
|
skill_dir: Path to skill directory
|
||||||
metadata: Skill metadata
|
metadata: Skill metadata
|
||||||
|
enable_chunking: Enable intelligent chunking for large documents
|
||||||
|
**kwargs: Additional chunking parameters (chunk_max_tokens, preserve_code_blocks)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
JSON string containing Chroma-compatible data
|
JSON string containing Chroma-compatible data
|
||||||
@@ -83,9 +85,21 @@ class ChromaAdaptor(SkillAdaptor):
|
|||||||
"version": metadata.version,
|
"version": metadata.version,
|
||||||
}
|
}
|
||||||
|
|
||||||
documents.append(content)
|
# Chunk if enabled
|
||||||
metadatas.append(doc_metadata)
|
chunks = self._maybe_chunk_content(
|
||||||
ids.append(self._generate_id(content, doc_metadata))
|
content,
|
||||||
|
doc_metadata,
|
||||||
|
enable_chunking=enable_chunking,
|
||||||
|
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||||
|
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||||
|
source_file="SKILL.md"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add all chunks to parallel arrays
|
||||||
|
for chunk_text, chunk_meta in chunks:
|
||||||
|
documents.append(chunk_text)
|
||||||
|
metadatas.append(chunk_meta)
|
||||||
|
ids.append(self._generate_id(chunk_text, chunk_meta))
|
||||||
|
|
||||||
# Convert all reference files using base helper method
|
# Convert all reference files using base helper method
|
||||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||||
@@ -101,9 +115,21 @@ class ChromaAdaptor(SkillAdaptor):
|
|||||||
"version": metadata.version,
|
"version": metadata.version,
|
||||||
}
|
}
|
||||||
|
|
||||||
documents.append(ref_content)
|
# Chunk if enabled
|
||||||
metadatas.append(doc_metadata)
|
chunks = self._maybe_chunk_content(
|
||||||
ids.append(self._generate_id(ref_content, doc_metadata))
|
ref_content,
|
||||||
|
doc_metadata,
|
||||||
|
enable_chunking=enable_chunking,
|
||||||
|
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||||
|
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||||
|
source_file=ref_file.name
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add all chunks to parallel arrays
|
||||||
|
for chunk_text, chunk_meta in chunks:
|
||||||
|
documents.append(chunk_text)
|
||||||
|
metadatas.append(chunk_meta)
|
||||||
|
ids.append(self._generate_id(chunk_text, chunk_meta))
|
||||||
|
|
||||||
# Return Chroma-compatible format
|
# Return Chroma-compatible format
|
||||||
return json.dumps(
|
return json.dumps(
|
||||||
|
|||||||
@@ -64,6 +64,8 @@ class FAISSHelpers(SkillAdaptor):
|
|||||||
Args:
|
Args:
|
||||||
skill_dir: Path to skill directory
|
skill_dir: Path to skill directory
|
||||||
metadata: Skill metadata
|
metadata: Skill metadata
|
||||||
|
enable_chunking: Enable intelligent chunking for large documents
|
||||||
|
**kwargs: Additional chunking parameters
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
JSON string containing FAISS-compatible data
|
JSON string containing FAISS-compatible data
|
||||||
@@ -85,9 +87,21 @@ class FAISSHelpers(SkillAdaptor):
|
|||||||
"version": metadata.version,
|
"version": metadata.version,
|
||||||
}
|
}
|
||||||
|
|
||||||
documents.append(content)
|
# Chunk if enabled
|
||||||
metadatas.append(doc_metadata)
|
chunks = self._maybe_chunk_content(
|
||||||
ids.append(self._generate_id(content, doc_metadata))
|
content,
|
||||||
|
doc_metadata,
|
||||||
|
enable_chunking=enable_chunking,
|
||||||
|
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||||
|
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||||
|
source_file="SKILL.md"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add all chunks to parallel arrays
|
||||||
|
for chunk_text, chunk_meta in chunks:
|
||||||
|
documents.append(chunk_text)
|
||||||
|
metadatas.append(chunk_meta)
|
||||||
|
ids.append(self._generate_id(chunk_text, chunk_meta))
|
||||||
|
|
||||||
# Convert all reference files using base helper method
|
# Convert all reference files using base helper method
|
||||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||||
@@ -102,9 +116,21 @@ class FAISSHelpers(SkillAdaptor):
|
|||||||
"version": metadata.version,
|
"version": metadata.version,
|
||||||
}
|
}
|
||||||
|
|
||||||
documents.append(ref_content)
|
# Chunk if enabled
|
||||||
metadatas.append(doc_metadata)
|
chunks = self._maybe_chunk_content(
|
||||||
ids.append(self._generate_id(ref_content, doc_metadata))
|
ref_content,
|
||||||
|
doc_metadata,
|
||||||
|
enable_chunking=enable_chunking,
|
||||||
|
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||||
|
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||||
|
source_file=ref_file.name
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add all chunks to parallel arrays
|
||||||
|
for chunk_text, chunk_meta in chunks:
|
||||||
|
documents.append(chunk_text)
|
||||||
|
metadatas.append(chunk_meta)
|
||||||
|
ids.append(self._generate_id(chunk_text, chunk_meta))
|
||||||
|
|
||||||
# FAISS configuration hints
|
# FAISS configuration hints
|
||||||
config = {
|
config = {
|
||||||
|
|||||||
@@ -47,6 +47,8 @@ class HaystackAdaptor(SkillAdaptor):
|
|||||||
Args:
|
Args:
|
||||||
skill_dir: Path to skill directory
|
skill_dir: Path to skill directory
|
||||||
metadata: Skill metadata
|
metadata: Skill metadata
|
||||||
|
enable_chunking: Enable intelligent chunking for large documents
|
||||||
|
**kwargs: Additional chunking parameters
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
JSON string containing array of Haystack Documents
|
JSON string containing array of Haystack Documents
|
||||||
@@ -58,38 +60,62 @@ class HaystackAdaptor(SkillAdaptor):
|
|||||||
if skill_md_path.exists():
|
if skill_md_path.exists():
|
||||||
content = self._read_existing_content(skill_dir)
|
content = self._read_existing_content(skill_dir)
|
||||||
if content.strip():
|
if content.strip():
|
||||||
documents.append(
|
doc_meta = {
|
||||||
{
|
|
||||||
"content": content,
|
|
||||||
"meta": {
|
|
||||||
"source": metadata.name,
|
"source": metadata.name,
|
||||||
"category": "overview",
|
"category": "overview",
|
||||||
"file": "SKILL.md",
|
"file": "SKILL.md",
|
||||||
"type": "documentation",
|
"type": "documentation",
|
||||||
"version": metadata.version,
|
"version": metadata.version,
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Chunk if enabled
|
||||||
|
chunks = self._maybe_chunk_content(
|
||||||
|
content,
|
||||||
|
doc_meta,
|
||||||
|
enable_chunking=enable_chunking,
|
||||||
|
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||||
|
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||||
|
source_file="SKILL.md"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Add all chunks as documents
|
||||||
|
for chunk_text, chunk_meta in chunks:
|
||||||
|
documents.append({
|
||||||
|
"content": chunk_text,
|
||||||
|
"meta": chunk_meta,
|
||||||
|
})
|
||||||
|
|
||||||
# Convert all reference files using base helper method
|
# Convert all reference files using base helper method
|
||||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||||
if ref_content.strip():
|
if ref_content.strip():
|
||||||
# Derive category from filename
|
# Derive category from filename
|
||||||
category = ref_file.stem.replace("_", " ").lower()
|
category = ref_file.stem.replace("_", " ").lower()
|
||||||
|
|
||||||
documents.append(
|
doc_meta = {
|
||||||
{
|
|
||||||
"content": ref_content,
|
|
||||||
"meta": {
|
|
||||||
"source": metadata.name,
|
"source": metadata.name,
|
||||||
"category": category,
|
"category": category,
|
||||||
"file": ref_file.name,
|
"file": ref_file.name,
|
||||||
"type": "reference",
|
"type": "reference",
|
||||||
"version": metadata.version,
|
"version": metadata.version,
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Chunk if enabled
|
||||||
|
chunks = self._maybe_chunk_content(
|
||||||
|
ref_content,
|
||||||
|
doc_meta,
|
||||||
|
enable_chunking=enable_chunking,
|
||||||
|
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||||
|
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||||
|
source_file=ref_file.name
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Add all chunks as documents
|
||||||
|
for chunk_text, chunk_meta in chunks:
|
||||||
|
documents.append({
|
||||||
|
"content": chunk_text,
|
||||||
|
"meta": chunk_meta,
|
||||||
|
})
|
||||||
|
|
||||||
# Return as formatted JSON
|
# Return as formatted JSON
|
||||||
return json.dumps(documents, indent=2, ensure_ascii=False)
|
return json.dumps(documents, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|||||||
@@ -62,6 +62,8 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
|||||||
Args:
|
Args:
|
||||||
skill_dir: Path to skill directory
|
skill_dir: Path to skill directory
|
||||||
metadata: Skill metadata
|
metadata: Skill metadata
|
||||||
|
enable_chunking: Enable intelligent chunking for large documents
|
||||||
|
**kwargs: Additional chunking parameters (chunk_max_tokens, preserve_code_blocks)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
JSON string containing array of LlamaIndex Nodes
|
JSON string containing array of LlamaIndex Nodes
|
||||||
@@ -80,15 +82,26 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
|||||||
"type": "documentation",
|
"type": "documentation",
|
||||||
"version": metadata.version,
|
"version": metadata.version,
|
||||||
}
|
}
|
||||||
nodes.append(
|
|
||||||
{
|
# Chunk if enabled
|
||||||
"text": content,
|
chunks = self._maybe_chunk_content(
|
||||||
"metadata": node_metadata,
|
content,
|
||||||
"id_": self._generate_node_id(content, node_metadata),
|
node_metadata,
|
||||||
"embedding": None,
|
enable_chunking=enable_chunking,
|
||||||
}
|
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||||
|
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||||
|
source_file="SKILL.md"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Add all chunks as nodes
|
||||||
|
for chunk_text, chunk_meta in chunks:
|
||||||
|
nodes.append({
|
||||||
|
"text": chunk_text,
|
||||||
|
"metadata": chunk_meta,
|
||||||
|
"id_": self._generate_node_id(chunk_text, chunk_meta),
|
||||||
|
"embedding": None,
|
||||||
|
})
|
||||||
|
|
||||||
# Convert all reference files using base helper method
|
# Convert all reference files using base helper method
|
||||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||||
if ref_content.strip():
|
if ref_content.strip():
|
||||||
@@ -103,15 +116,25 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
|||||||
"version": metadata.version,
|
"version": metadata.version,
|
||||||
}
|
}
|
||||||
|
|
||||||
nodes.append(
|
# Chunk if enabled
|
||||||
{
|
chunks = self._maybe_chunk_content(
|
||||||
"text": ref_content,
|
ref_content,
|
||||||
"metadata": node_metadata,
|
node_metadata,
|
||||||
"id_": self._generate_node_id(ref_content, node_metadata),
|
enable_chunking=enable_chunking,
|
||||||
"embedding": None,
|
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||||
}
|
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||||
|
source_file=ref_file.name
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Add all chunks as nodes
|
||||||
|
for chunk_text, chunk_meta in chunks:
|
||||||
|
nodes.append({
|
||||||
|
"text": chunk_text,
|
||||||
|
"metadata": chunk_meta,
|
||||||
|
"id_": self._generate_node_id(chunk_text, chunk_meta),
|
||||||
|
"embedding": None,
|
||||||
|
})
|
||||||
|
|
||||||
# Return as formatted JSON
|
# Return as formatted JSON
|
||||||
return json.dumps(nodes, indent=2, ensure_ascii=False)
|
return json.dumps(nodes, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|||||||
@@ -61,6 +61,8 @@ class QdrantAdaptor(SkillAdaptor):
|
|||||||
Args:
|
Args:
|
||||||
skill_dir: Path to skill directory
|
skill_dir: Path to skill directory
|
||||||
metadata: Skill metadata
|
metadata: Skill metadata
|
||||||
|
enable_chunking: Enable intelligent chunking for large documents
|
||||||
|
**kwargs: Additional chunking parameters
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
JSON string containing Qdrant-compatible data
|
JSON string containing Qdrant-compatible data
|
||||||
@@ -72,21 +74,41 @@ class QdrantAdaptor(SkillAdaptor):
|
|||||||
if skill_md_path.exists():
|
if skill_md_path.exists():
|
||||||
content = self._read_existing_content(skill_dir)
|
content = self._read_existing_content(skill_dir)
|
||||||
if content.strip():
|
if content.strip():
|
||||||
point_id = self._generate_point_id(content, {
|
payload_meta = {
|
||||||
"source": metadata.name,
|
"source": metadata.name,
|
||||||
"file": "SKILL.md"
|
"category": "overview",
|
||||||
|
"file": "SKILL.md",
|
||||||
|
"type": "documentation",
|
||||||
|
"version": metadata.version,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Chunk if enabled
|
||||||
|
chunks = self._maybe_chunk_content(
|
||||||
|
content,
|
||||||
|
payload_meta,
|
||||||
|
enable_chunking=enable_chunking,
|
||||||
|
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||||
|
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||||
|
source_file="SKILL.md"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add all chunks as points
|
||||||
|
for chunk_text, chunk_meta in chunks:
|
||||||
|
point_id = self._generate_point_id(chunk_text, {
|
||||||
|
"source": chunk_meta.get("source", metadata.name),
|
||||||
|
"file": chunk_meta.get("file", "SKILL.md")
|
||||||
})
|
})
|
||||||
|
|
||||||
points.append({
|
points.append({
|
||||||
"id": point_id,
|
"id": point_id,
|
||||||
"vector": None, # User will generate embeddings
|
"vector": None, # User will generate embeddings
|
||||||
"payload": {
|
"payload": {
|
||||||
"content": content,
|
"content": chunk_text,
|
||||||
"source": metadata.name,
|
"source": chunk_meta.get("source", metadata.name),
|
||||||
"category": "overview",
|
"category": chunk_meta.get("category", "overview"),
|
||||||
"file": "SKILL.md",
|
"file": chunk_meta.get("file", "SKILL.md"),
|
||||||
"type": "documentation",
|
"type": chunk_meta.get("type", "documentation"),
|
||||||
"version": metadata.version,
|
"version": chunk_meta.get("version", metadata.version),
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -95,21 +117,41 @@ class QdrantAdaptor(SkillAdaptor):
|
|||||||
if ref_content.strip():
|
if ref_content.strip():
|
||||||
category = ref_file.stem.replace("_", " ").lower()
|
category = ref_file.stem.replace("_", " ").lower()
|
||||||
|
|
||||||
point_id = self._generate_point_id(ref_content, {
|
payload_meta = {
|
||||||
"source": metadata.name,
|
"source": metadata.name,
|
||||||
"file": ref_file.name
|
"category": category,
|
||||||
|
"file": ref_file.name,
|
||||||
|
"type": "reference",
|
||||||
|
"version": metadata.version,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Chunk if enabled
|
||||||
|
chunks = self._maybe_chunk_content(
|
||||||
|
ref_content,
|
||||||
|
payload_meta,
|
||||||
|
enable_chunking=enable_chunking,
|
||||||
|
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||||
|
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||||
|
source_file=ref_file.name
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add all chunks as points
|
||||||
|
for chunk_text, chunk_meta in chunks:
|
||||||
|
point_id = self._generate_point_id(chunk_text, {
|
||||||
|
"source": chunk_meta.get("source", metadata.name),
|
||||||
|
"file": chunk_meta.get("file", ref_file.name)
|
||||||
})
|
})
|
||||||
|
|
||||||
points.append({
|
points.append({
|
||||||
"id": point_id,
|
"id": point_id,
|
||||||
"vector": None, # User will generate embeddings
|
"vector": None, # User will generate embeddings
|
||||||
"payload": {
|
"payload": {
|
||||||
"content": ref_content,
|
"content": chunk_text,
|
||||||
"source": metadata.name,
|
"source": chunk_meta.get("source", metadata.name),
|
||||||
"category": category,
|
"category": chunk_meta.get("category", category),
|
||||||
"file": ref_file.name,
|
"file": chunk_meta.get("file", ref_file.name),
|
||||||
"type": "reference",
|
"type": chunk_meta.get("type", "reference"),
|
||||||
"version": metadata.version,
|
"version": chunk_meta.get("version", metadata.version),
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
@@ -122,6 +122,8 @@ class WeaviateAdaptor(SkillAdaptor):
|
|||||||
Args:
|
Args:
|
||||||
skill_dir: Path to skill directory
|
skill_dir: Path to skill directory
|
||||||
metadata: Skill metadata
|
metadata: Skill metadata
|
||||||
|
enable_chunking: Enable intelligent chunking for large documents
|
||||||
|
**kwargs: Additional chunking parameters
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
JSON string containing Weaviate objects and schema
|
JSON string containing Weaviate objects and schema
|
||||||
@@ -141,20 +143,30 @@ class WeaviateAdaptor(SkillAdaptor):
|
|||||||
"version": metadata.version,
|
"version": metadata.version,
|
||||||
}
|
}
|
||||||
|
|
||||||
objects.append(
|
# Chunk if enabled
|
||||||
{
|
chunks = self._maybe_chunk_content(
|
||||||
"id": self._generate_uuid(content, obj_metadata),
|
content,
|
||||||
"properties": {
|
obj_metadata,
|
||||||
"content": content,
|
enable_chunking=enable_chunking,
|
||||||
"source": obj_metadata["source"],
|
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||||
"category": obj_metadata["category"],
|
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||||
"file": obj_metadata["file"],
|
source_file="SKILL.md"
|
||||||
"type": obj_metadata["type"],
|
|
||||||
"version": obj_metadata["version"],
|
|
||||||
},
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Add all chunks as objects
|
||||||
|
for chunk_text, chunk_meta in chunks:
|
||||||
|
objects.append({
|
||||||
|
"id": self._generate_uuid(chunk_text, chunk_meta),
|
||||||
|
"properties": {
|
||||||
|
"content": chunk_text,
|
||||||
|
"source": chunk_meta.get("source", metadata.name),
|
||||||
|
"category": chunk_meta.get("category", "overview"),
|
||||||
|
"file": chunk_meta.get("file", "SKILL.md"),
|
||||||
|
"type": chunk_meta.get("type", "documentation"),
|
||||||
|
"version": chunk_meta.get("version", metadata.version),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
# Convert all reference files using base helper method
|
# Convert all reference files using base helper method
|
||||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||||
if ref_content.strip():
|
if ref_content.strip():
|
||||||
@@ -169,20 +181,30 @@ class WeaviateAdaptor(SkillAdaptor):
|
|||||||
"version": metadata.version,
|
"version": metadata.version,
|
||||||
}
|
}
|
||||||
|
|
||||||
objects.append(
|
# Chunk if enabled
|
||||||
{
|
chunks = self._maybe_chunk_content(
|
||||||
"id": self._generate_uuid(ref_content, obj_metadata),
|
ref_content,
|
||||||
"properties": {
|
obj_metadata,
|
||||||
"content": ref_content,
|
enable_chunking=enable_chunking,
|
||||||
"source": obj_metadata["source"],
|
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||||
"category": obj_metadata["category"],
|
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||||
"file": obj_metadata["file"],
|
source_file=ref_file.name
|
||||||
"type": obj_metadata["type"],
|
|
||||||
"version": obj_metadata["version"],
|
|
||||||
},
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Add all chunks as objects
|
||||||
|
for chunk_text, chunk_meta in chunks:
|
||||||
|
objects.append({
|
||||||
|
"id": self._generate_uuid(chunk_text, chunk_meta),
|
||||||
|
"properties": {
|
||||||
|
"content": chunk_text,
|
||||||
|
"source": chunk_meta.get("source", metadata.name),
|
||||||
|
"category": chunk_meta.get("category", category),
|
||||||
|
"file": chunk_meta.get("file", ref_file.name),
|
||||||
|
"type": chunk_meta.get("type", "reference"),
|
||||||
|
"version": chunk_meta.get("version", metadata.version),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
# Generate schema
|
# Generate schema
|
||||||
class_name = "".join(word.capitalize() for word in metadata.name.split("_"))
|
class_name = "".join(word.capitalize() for word in metadata.name.split("_"))
|
||||||
schema = self._generate_schema(class_name)
|
schema = self._generate_schema(class_name)
|
||||||
|
|||||||
Reference in New Issue
Block a user