diff --git a/src/skill_seekers/cli/adaptors/chroma.py b/src/skill_seekers/cli/adaptors/chroma.py index 1ce3bf0..e8b1e3b 100644 --- a/src/skill_seekers/cli/adaptors/chroma.py +++ b/src/skill_seekers/cli/adaptors/chroma.py @@ -62,6 +62,8 @@ class ChromaAdaptor(SkillAdaptor): Args: skill_dir: Path to skill directory metadata: Skill metadata + enable_chunking: Enable intelligent chunking for large documents + **kwargs: Additional chunking parameters (chunk_max_tokens, preserve_code_blocks) Returns: JSON string containing Chroma-compatible data @@ -83,9 +85,21 @@ class ChromaAdaptor(SkillAdaptor): "version": metadata.version, } - documents.append(content) - metadatas.append(doc_metadata) - ids.append(self._generate_id(content, doc_metadata)) + # Chunk if enabled + chunks = self._maybe_chunk_content( + content, + doc_metadata, + enable_chunking=enable_chunking, + chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), + preserve_code_blocks=kwargs.get('preserve_code_blocks', True), + source_file="SKILL.md" + ) + + # Add all chunks to parallel arrays + for chunk_text, chunk_meta in chunks: + documents.append(chunk_text) + metadatas.append(chunk_meta) + ids.append(self._generate_id(chunk_text, chunk_meta)) # Convert all reference files using base helper method for ref_file, ref_content in self._iterate_references(skill_dir): @@ -101,9 +115,21 @@ class ChromaAdaptor(SkillAdaptor): "version": metadata.version, } - documents.append(ref_content) - metadatas.append(doc_metadata) - ids.append(self._generate_id(ref_content, doc_metadata)) + # Chunk if enabled + chunks = self._maybe_chunk_content( + ref_content, + doc_metadata, + enable_chunking=enable_chunking, + chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), + preserve_code_blocks=kwargs.get('preserve_code_blocks', True), + source_file=ref_file.name + ) + + # Add all chunks to parallel arrays + for chunk_text, chunk_meta in chunks: + documents.append(chunk_text) + metadatas.append(chunk_meta) + ids.append(self._generate_id(chunk_text, chunk_meta)) # Return Chroma-compatible format return json.dumps( diff --git a/src/skill_seekers/cli/adaptors/faiss_helpers.py b/src/skill_seekers/cli/adaptors/faiss_helpers.py index d09eedf..b8b5fa1 100644 --- a/src/skill_seekers/cli/adaptors/faiss_helpers.py +++ b/src/skill_seekers/cli/adaptors/faiss_helpers.py @@ -64,6 +64,8 @@ class FAISSHelpers(SkillAdaptor): Args: skill_dir: Path to skill directory metadata: Skill metadata + enable_chunking: Enable intelligent chunking for large documents + **kwargs: Additional chunking parameters Returns: JSON string containing FAISS-compatible data @@ -85,9 +87,21 @@ class FAISSHelpers(SkillAdaptor): "version": metadata.version, } - documents.append(content) - metadatas.append(doc_metadata) - ids.append(self._generate_id(content, doc_metadata)) + # Chunk if enabled + chunks = self._maybe_chunk_content( + content, + doc_metadata, + enable_chunking=enable_chunking, + chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), + preserve_code_blocks=kwargs.get('preserve_code_blocks', True), + source_file="SKILL.md" + ) + + # Add all chunks to parallel arrays + for chunk_text, chunk_meta in chunks: + documents.append(chunk_text) + metadatas.append(chunk_meta) + ids.append(self._generate_id(chunk_text, chunk_meta)) # Convert all reference files using base helper method for ref_file, ref_content in self._iterate_references(skill_dir): @@ -102,9 +116,21 @@ class FAISSHelpers(SkillAdaptor): "version": metadata.version, } - documents.append(ref_content) - metadatas.append(doc_metadata) - ids.append(self._generate_id(ref_content, doc_metadata)) + # Chunk if enabled + chunks = self._maybe_chunk_content( + ref_content, + doc_metadata, + enable_chunking=enable_chunking, + chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), + preserve_code_blocks=kwargs.get('preserve_code_blocks', True), + source_file=ref_file.name + ) + + # Add all chunks to parallel arrays + for chunk_text, chunk_meta in chunks: + documents.append(chunk_text) + metadatas.append(chunk_meta) + ids.append(self._generate_id(chunk_text, chunk_meta)) # FAISS configuration hints config = { diff --git a/src/skill_seekers/cli/adaptors/haystack.py b/src/skill_seekers/cli/adaptors/haystack.py index 1faffe2..1a69f79 100644 --- a/src/skill_seekers/cli/adaptors/haystack.py +++ b/src/skill_seekers/cli/adaptors/haystack.py @@ -47,6 +47,8 @@ class HaystackAdaptor(SkillAdaptor): Args: skill_dir: Path to skill directory metadata: Skill metadata + enable_chunking: Enable intelligent chunking for large documents + **kwargs: Additional chunking parameters Returns: JSON string containing array of Haystack Documents @@ -58,38 +60,62 @@ class HaystackAdaptor(SkillAdaptor): if skill_md_path.exists(): content = self._read_existing_content(skill_dir) if content.strip(): - documents.append( - { - "content": content, - "meta": { - "source": metadata.name, - "category": "overview", - "file": "SKILL.md", - "type": "documentation", - "version": metadata.version, - }, - } + doc_meta = { + "source": metadata.name, + "category": "overview", + "file": "SKILL.md", + "type": "documentation", + "version": metadata.version, + } + + # Chunk if enabled + chunks = self._maybe_chunk_content( + content, + doc_meta, + enable_chunking=enable_chunking, + chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), + preserve_code_blocks=kwargs.get('preserve_code_blocks', True), + source_file="SKILL.md" ) + # Add all chunks as documents + for chunk_text, chunk_meta in chunks: + documents.append({ + "content": chunk_text, + "meta": chunk_meta, + }) + # Convert all reference files using base helper method for ref_file, ref_content in self._iterate_references(skill_dir): if ref_content.strip(): # Derive category from filename category = ref_file.stem.replace("_", " ").lower() - documents.append( - { - "content": ref_content, - "meta": { - "source": metadata.name, - "category": category, - "file": ref_file.name, - "type": "reference", - "version": metadata.version, - }, - } + doc_meta = { + "source": metadata.name, + "category": category, + "file": ref_file.name, + "type": "reference", + "version": metadata.version, + } + + # Chunk if enabled + chunks = self._maybe_chunk_content( + ref_content, + doc_meta, + enable_chunking=enable_chunking, + chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), + preserve_code_blocks=kwargs.get('preserve_code_blocks', True), + source_file=ref_file.name ) + # Add all chunks as documents + for chunk_text, chunk_meta in chunks: + documents.append({ + "content": chunk_text, + "meta": chunk_meta, + }) + # Return as formatted JSON return json.dumps(documents, indent=2, ensure_ascii=False) diff --git a/src/skill_seekers/cli/adaptors/llama_index.py b/src/skill_seekers/cli/adaptors/llama_index.py index 8452ca3..994985f 100644 --- a/src/skill_seekers/cli/adaptors/llama_index.py +++ b/src/skill_seekers/cli/adaptors/llama_index.py @@ -62,6 +62,8 @@ class LlamaIndexAdaptor(SkillAdaptor): Args: skill_dir: Path to skill directory metadata: Skill metadata + enable_chunking: Enable intelligent chunking for large documents + **kwargs: Additional chunking parameters (chunk_max_tokens, preserve_code_blocks) Returns: JSON string containing array of LlamaIndex Nodes @@ -80,15 +82,26 @@ class LlamaIndexAdaptor(SkillAdaptor): "type": "documentation", "version": metadata.version, } - nodes.append( - { - "text": content, - "metadata": node_metadata, - "id_": self._generate_node_id(content, node_metadata), - "embedding": None, - } + + # Chunk if enabled + chunks = self._maybe_chunk_content( + content, + node_metadata, + enable_chunking=enable_chunking, + chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), + preserve_code_blocks=kwargs.get('preserve_code_blocks', True), + source_file="SKILL.md" ) + # Add all chunks as nodes + for chunk_text, chunk_meta in chunks: + nodes.append({ + "text": chunk_text, + "metadata": chunk_meta, + "id_": self._generate_node_id(chunk_text, chunk_meta), + "embedding": None, + }) + # Convert all reference files using base helper method for ref_file, ref_content in self._iterate_references(skill_dir): if ref_content.strip(): @@ -103,15 +116,25 @@ class LlamaIndexAdaptor(SkillAdaptor): "version": metadata.version, } - nodes.append( - { - "text": ref_content, - "metadata": node_metadata, - "id_": self._generate_node_id(ref_content, node_metadata), - "embedding": None, - } + # Chunk if enabled + chunks = self._maybe_chunk_content( + ref_content, + node_metadata, + enable_chunking=enable_chunking, + chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), + preserve_code_blocks=kwargs.get('preserve_code_blocks', True), + source_file=ref_file.name ) + # Add all chunks as nodes + for chunk_text, chunk_meta in chunks: + nodes.append({ + "text": chunk_text, + "metadata": chunk_meta, + "id_": self._generate_node_id(chunk_text, chunk_meta), + "embedding": None, + }) + # Return as formatted JSON return json.dumps(nodes, indent=2, ensure_ascii=False) diff --git a/src/skill_seekers/cli/adaptors/qdrant.py b/src/skill_seekers/cli/adaptors/qdrant.py index a5b79be..ac7dab5 100644 --- a/src/skill_seekers/cli/adaptors/qdrant.py +++ b/src/skill_seekers/cli/adaptors/qdrant.py @@ -61,6 +61,8 @@ class QdrantAdaptor(SkillAdaptor): Args: skill_dir: Path to skill directory metadata: Skill metadata + enable_chunking: Enable intelligent chunking for large documents + **kwargs: Additional chunking parameters Returns: JSON string containing Qdrant-compatible data @@ -72,46 +74,86 @@ class QdrantAdaptor(SkillAdaptor): if skill_md_path.exists(): content = self._read_existing_content(skill_dir) if content.strip(): - point_id = self._generate_point_id(content, { + payload_meta = { "source": metadata.name, - "file": "SKILL.md" - }) + "category": "overview", + "file": "SKILL.md", + "type": "documentation", + "version": metadata.version, + } - points.append({ - "id": point_id, - "vector": None, # User will generate embeddings - "payload": { - "content": content, - "source": metadata.name, - "category": "overview", - "file": "SKILL.md", - "type": "documentation", - "version": metadata.version, - } - }) + # Chunk if enabled + chunks = self._maybe_chunk_content( + content, + payload_meta, + enable_chunking=enable_chunking, + chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), + preserve_code_blocks=kwargs.get('preserve_code_blocks', True), + source_file="SKILL.md" + ) + + # Add all chunks as points + for chunk_text, chunk_meta in chunks: + point_id = self._generate_point_id(chunk_text, { + "source": chunk_meta.get("source", metadata.name), + "file": chunk_meta.get("file", "SKILL.md") + }) + + points.append({ + "id": point_id, + "vector": None, # User will generate embeddings + "payload": { + "content": chunk_text, + "source": chunk_meta.get("source", metadata.name), + "category": chunk_meta.get("category", "overview"), + "file": chunk_meta.get("file", "SKILL.md"), + "type": chunk_meta.get("type", "documentation"), + "version": chunk_meta.get("version", metadata.version), + } + }) # Convert all reference files using base helper method for ref_file, ref_content in self._iterate_references(skill_dir): if ref_content.strip(): category = ref_file.stem.replace("_", " ").lower() - point_id = self._generate_point_id(ref_content, { + payload_meta = { "source": metadata.name, - "file": ref_file.name - }) + "category": category, + "file": ref_file.name, + "type": "reference", + "version": metadata.version, + } - points.append({ - "id": point_id, - "vector": None, # User will generate embeddings - "payload": { - "content": ref_content, - "source": metadata.name, - "category": category, - "file": ref_file.name, - "type": "reference", - "version": metadata.version, - } - }) + # Chunk if enabled + chunks = self._maybe_chunk_content( + ref_content, + payload_meta, + enable_chunking=enable_chunking, + chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), + preserve_code_blocks=kwargs.get('preserve_code_blocks', True), + source_file=ref_file.name + ) + + # Add all chunks as points + for chunk_text, chunk_meta in chunks: + point_id = self._generate_point_id(chunk_text, { + "source": chunk_meta.get("source", metadata.name), + "file": chunk_meta.get("file", ref_file.name) + }) + + points.append({ + "id": point_id, + "vector": None, # User will generate embeddings + "payload": { + "content": chunk_text, + "source": chunk_meta.get("source", metadata.name), + "category": chunk_meta.get("category", category), + "file": chunk_meta.get("file", ref_file.name), + "type": chunk_meta.get("type", "reference"), + "version": chunk_meta.get("version", metadata.version), + } + }) # Qdrant configuration config = { diff --git a/src/skill_seekers/cli/adaptors/weaviate.py b/src/skill_seekers/cli/adaptors/weaviate.py index aca17a7..6628631 100644 --- a/src/skill_seekers/cli/adaptors/weaviate.py +++ b/src/skill_seekers/cli/adaptors/weaviate.py @@ -122,6 +122,8 @@ class WeaviateAdaptor(SkillAdaptor): Args: skill_dir: Path to skill directory metadata: Skill metadata + enable_chunking: Enable intelligent chunking for large documents + **kwargs: Additional chunking parameters Returns: JSON string containing Weaviate objects and schema @@ -141,20 +143,30 @@ class WeaviateAdaptor(SkillAdaptor): "version": metadata.version, } - objects.append( - { - "id": self._generate_uuid(content, obj_metadata), - "properties": { - "content": content, - "source": obj_metadata["source"], - "category": obj_metadata["category"], - "file": obj_metadata["file"], - "type": obj_metadata["type"], - "version": obj_metadata["version"], - }, - } + # Chunk if enabled + chunks = self._maybe_chunk_content( + content, + obj_metadata, + enable_chunking=enable_chunking, + chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), + preserve_code_blocks=kwargs.get('preserve_code_blocks', True), + source_file="SKILL.md" ) + # Add all chunks as objects + for chunk_text, chunk_meta in chunks: + objects.append({ + "id": self._generate_uuid(chunk_text, chunk_meta), + "properties": { + "content": chunk_text, + "source": chunk_meta.get("source", metadata.name), + "category": chunk_meta.get("category", "overview"), + "file": chunk_meta.get("file", "SKILL.md"), + "type": chunk_meta.get("type", "documentation"), + "version": chunk_meta.get("version", metadata.version), + }, + }) + # Convert all reference files using base helper method for ref_file, ref_content in self._iterate_references(skill_dir): if ref_content.strip(): @@ -169,20 +181,30 @@ class WeaviateAdaptor(SkillAdaptor): "version": metadata.version, } - objects.append( - { - "id": self._generate_uuid(ref_content, obj_metadata), - "properties": { - "content": ref_content, - "source": obj_metadata["source"], - "category": obj_metadata["category"], - "file": obj_metadata["file"], - "type": obj_metadata["type"], - "version": obj_metadata["version"], - }, - } + # Chunk if enabled + chunks = self._maybe_chunk_content( + ref_content, + obj_metadata, + enable_chunking=enable_chunking, + chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), + preserve_code_blocks=kwargs.get('preserve_code_blocks', True), + source_file=ref_file.name ) + # Add all chunks as objects + for chunk_text, chunk_meta in chunks: + objects.append({ + "id": self._generate_uuid(chunk_text, chunk_meta), + "properties": { + "content": chunk_text, + "source": chunk_meta.get("source", metadata.name), + "category": chunk_meta.get("category", category), + "file": chunk_meta.get("file", ref_file.name), + "type": chunk_meta.get("type", "reference"), + "version": chunk_meta.get("version", metadata.version), + }, + }) + # Generate schema class_name = "".join(word.capitalize() for word in metadata.name.split("_")) schema = self._generate_schema(class_name)