refactor: Adopt helper methods across 7 RAG adaptors to eliminate duplication

Refactored all seven RAG adaptors (LangChain, LlamaIndex, Haystack, Weaviate,
Chroma, FAISS, Qdrant) to use the existing helper methods in base.py, removing
~215 lines of duplicated code (a 26% reduction).

Key improvements:
- All adaptors now use _format_output_path() for consistent path handling
- All adaptors now use _iterate_references() for reference file iteration
- Added _generate_deterministic_id() helper with 3 formats (hex, uuid, uuid5)
- 5 adaptors refactored to use unified ID generation
- Removed 6 imports (hashlib, uuid) that became unused after the refactor

Benefits:
- DRY principles enforced across all RAG adaptors
- Single source of truth for common logic
- Easier maintenance and testing
- Consistent behavior across platforms

All 159 adaptor tests pass, with zero regressions.

Phase 1 of optional enhancements (Phases 2-5 pending).

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-07 22:31:10 +03:00
parent ffe8fc4de2
commit d84e5878a1
9 changed files with 489 additions and 304 deletions

View File

@@ -9,8 +9,6 @@ Qdrant stores vectors and metadata together in collections with points.
import json
from pathlib import Path
from typing import Any
import hashlib
import uuid
from .base import SkillAdaptor, SkillMetadata
@@ -43,10 +41,7 @@ class QdrantAdaptor(SkillAdaptor):
Returns:
UUID string (version 5, deterministic)
"""
# Use content hash + source for deterministic UUID
namespace = uuid.UUID("00000000-0000-0000-0000-000000000000")
id_string = f"{metadata.get('source', '')}-{metadata.get('file', '')}-{content[:100]}"
return str(uuid.uuid5(namespace, id_string))
return self._generate_deterministic_id(content, metadata, format="uuid5")
def format_skill_md(self, skill_dir: Path, metadata: SkillMetadata) -> str:
"""
@@ -89,36 +84,28 @@ class QdrantAdaptor(SkillAdaptor):
}
})
# Convert all reference files
refs_dir = skill_dir / "references"
if refs_dir.exists():
for ref_file in sorted(refs_dir.glob("*.md")):
if ref_file.is_file() and not ref_file.name.startswith("."):
try:
ref_content = ref_file.read_text(encoding="utf-8")
if ref_content.strip():
category = ref_file.stem.replace("_", " ").lower()
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
if ref_content.strip():
category = ref_file.stem.replace("_", " ").lower()
point_id = self._generate_point_id(ref_content, {
"source": metadata.name,
"file": ref_file.name
})
point_id = self._generate_point_id(ref_content, {
"source": metadata.name,
"file": ref_file.name
})
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": ref_content,
"source": metadata.name,
"category": category,
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
}
})
except Exception as e:
print(f"⚠️ Warning: Could not read {ref_file.name}: {e}")
continue
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": ref_content,
"source": metadata.name,
"category": category,
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
}
})
# Qdrant configuration
config = {
@@ -158,18 +145,8 @@ class QdrantAdaptor(SkillAdaptor):
"""
skill_dir = Path(skill_dir)
# Determine output filename
if output_path.is_dir() or str(output_path).endswith("/"):
output_path = Path(output_path) / f"{skill_dir.name}-qdrant.json"
elif not str(output_path).endswith(".json"):
output_str = str(output_path).replace(".zip", ".json").replace(".tar.gz", ".json")
if not output_str.endswith("-qdrant.json"):
output_str = output_str.replace(".json", "-qdrant.json")
if not output_str.endswith(".json"):
output_str += ".json"
output_path = Path(output_str)
output_path = Path(output_path)
# Determine output filename using base helper method
output_path = self._format_output_path(skill_dir, Path(output_path), "-qdrant.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata