style: Format all Python files with ruff
- Formatted 103 files to comply with ruff format requirements
- No code logic changes, only formatting/whitespace
- Fixes CI formatting check failures
This commit is contained in:
@@ -33,9 +33,9 @@ from .runner import BenchmarkRunner
|
||||
from .models import BenchmarkReport, Metric
|
||||
|
||||
__all__ = [
|
||||
'Benchmark',
|
||||
'BenchmarkResult',
|
||||
'BenchmarkRunner',
|
||||
'BenchmarkReport',
|
||||
'Metric',
|
||||
"Benchmark",
|
||||
"BenchmarkResult",
|
||||
"BenchmarkRunner",
|
||||
"BenchmarkReport",
|
||||
"Metric",
|
||||
]
|
||||
|
||||
@@ -11,12 +11,7 @@ from typing import Any
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
|
||||
from .models import (
|
||||
Metric,
|
||||
TimingResult,
|
||||
MemoryUsage,
|
||||
BenchmarkReport
|
||||
)
|
||||
from .models import Metric, TimingResult, MemoryUsage, BenchmarkReport
|
||||
|
||||
|
||||
class BenchmarkResult:
|
||||
@@ -97,7 +92,7 @@ class BenchmarkResult:
|
||||
memory=self.memory,
|
||||
metrics=self.metrics,
|
||||
system_info=self.system_info,
|
||||
recommendations=self.recommendations
|
||||
recommendations=self.recommendations,
|
||||
)
|
||||
|
||||
|
||||
@@ -161,7 +156,7 @@ class Benchmark:
|
||||
operation=operation,
|
||||
duration=duration,
|
||||
iterations=iterations,
|
||||
avg_duration=duration / iterations if iterations > 1 else duration
|
||||
avg_duration=duration / iterations if iterations > 1 else duration,
|
||||
)
|
||||
|
||||
self.result.add_timing(timing)
|
||||
@@ -201,7 +196,7 @@ class Benchmark:
|
||||
before_mb=mem_before,
|
||||
after_mb=mem_after,
|
||||
peak_mb=peak_memory,
|
||||
allocated_mb=mem_after - mem_before
|
||||
allocated_mb=mem_after - mem_before,
|
||||
)
|
||||
|
||||
self.result.add_memory(usage)
|
||||
@@ -212,7 +207,7 @@ class Benchmark:
|
||||
*args,
|
||||
operation: str | None = None,
|
||||
track_memory: bool = False,
|
||||
**kwargs
|
||||
**kwargs,
|
||||
) -> Any:
|
||||
"""
|
||||
Measure function execution.
|
||||
@@ -260,17 +255,16 @@ class Benchmark:
|
||||
def load_config(path):
|
||||
return json.load(open(path))
|
||||
"""
|
||||
|
||||
def decorator(func: Callable) -> Callable:
|
||||
@functools.wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
return self.measure(
|
||||
func,
|
||||
*args,
|
||||
operation=operation,
|
||||
track_memory=track_memory,
|
||||
**kwargs
|
||||
func, *args, operation=operation, track_memory=track_memory, **kwargs
|
||||
)
|
||||
|
||||
return wrapper
|
||||
|
||||
return decorator
|
||||
|
||||
def metric(self, name: str, value: float, unit: str):
|
||||
@@ -285,11 +279,7 @@ class Benchmark:
|
||||
Examples:
|
||||
benchmark.metric("pages_per_sec", 12.5, "pages/sec")
|
||||
"""
|
||||
metric = Metric(
|
||||
name=name,
|
||||
value=value,
|
||||
unit=unit
|
||||
)
|
||||
metric = Metric(name=name, value=value, unit=unit)
|
||||
self.result.add_metric(metric)
|
||||
|
||||
def recommend(self, text: str):
|
||||
@@ -328,7 +318,7 @@ class Benchmark:
|
||||
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(path, 'w') as f:
|
||||
with open(path, "w") as f:
|
||||
f.write(report.model_dump_json(indent=2))
|
||||
|
||||
def analyze(self):
|
||||
@@ -339,11 +329,7 @@ class Benchmark:
|
||||
"""
|
||||
# Analyze timing bottlenecks
|
||||
if self.result.timings:
|
||||
sorted_timings = sorted(
|
||||
self.result.timings,
|
||||
key=lambda t: t.duration,
|
||||
reverse=True
|
||||
)
|
||||
sorted_timings = sorted(self.result.timings, key=lambda t: t.duration, reverse=True)
|
||||
|
||||
slowest = sorted_timings[0]
|
||||
total_time = sum(t.duration for t in self.result.timings)
|
||||
@@ -351,7 +337,7 @@ class Benchmark:
|
||||
if slowest.duration > total_time * 0.5:
|
||||
self.recommend(
|
||||
f"Bottleneck: '{slowest.operation}' takes "
|
||||
f"{slowest.duration:.1f}s ({slowest.duration/total_time*100:.0f}% of total)"
|
||||
f"{slowest.duration:.1f}s ({slowest.duration / total_time * 100:.0f}% of total)"
|
||||
)
|
||||
|
||||
# Analyze memory usage
|
||||
@@ -360,8 +346,7 @@ class Benchmark:
|
||||
|
||||
if peak > 1000: # >1GB
|
||||
self.recommend(
|
||||
f"High memory usage: {peak:.0f}MB peak. "
|
||||
"Consider processing in batches."
|
||||
f"High memory usage: {peak:.0f}MB peak. Consider processing in batches."
|
||||
)
|
||||
|
||||
# Check for memory leaks
|
||||
|
||||
@@ -14,8 +14,7 @@ class Metric(BaseModel):
|
||||
value: float = Field(..., description="Metric value")
|
||||
unit: str = Field(..., description="Unit (seconds, bytes, pages/sec, etc.)")
|
||||
timestamp: datetime = Field(
|
||||
default_factory=datetime.utcnow,
|
||||
description="When metric was recorded"
|
||||
default_factory=datetime.utcnow, description="When metric was recorded"
|
||||
)
|
||||
|
||||
|
||||
@@ -48,26 +47,13 @@ class BenchmarkReport(BaseModel):
|
||||
finished_at: datetime = Field(..., description="Finish time")
|
||||
total_duration: float = Field(..., description="Total duration in seconds")
|
||||
|
||||
timings: list[TimingResult] = Field(
|
||||
default_factory=list,
|
||||
description="Timing results"
|
||||
)
|
||||
memory: list[MemoryUsage] = Field(
|
||||
default_factory=list,
|
||||
description="Memory usage results"
|
||||
)
|
||||
metrics: list[Metric] = Field(
|
||||
default_factory=list,
|
||||
description="Additional metrics"
|
||||
)
|
||||
timings: list[TimingResult] = Field(default_factory=list, description="Timing results")
|
||||
memory: list[MemoryUsage] = Field(default_factory=list, description="Memory usage results")
|
||||
metrics: list[Metric] = Field(default_factory=list, description="Additional metrics")
|
||||
|
||||
system_info: dict[str, Any] = Field(
|
||||
default_factory=dict,
|
||||
description="System information"
|
||||
)
|
||||
system_info: dict[str, Any] = Field(default_factory=dict, description="System information")
|
||||
recommendations: list[str] = Field(
|
||||
default_factory=list,
|
||||
description="Optimization recommendations"
|
||||
default_factory=list, description="Optimization recommendations"
|
||||
)
|
||||
|
||||
@property
|
||||
@@ -89,14 +75,8 @@ class ComparisonReport(BaseModel):
|
||||
baseline: BenchmarkReport = Field(..., description="Baseline benchmark")
|
||||
current: BenchmarkReport = Field(..., description="Current benchmark")
|
||||
|
||||
improvements: list[str] = Field(
|
||||
default_factory=list,
|
||||
description="Performance improvements"
|
||||
)
|
||||
regressions: list[str] = Field(
|
||||
default_factory=list,
|
||||
description="Performance regressions"
|
||||
)
|
||||
improvements: list[str] = Field(default_factory=list, description="Performance improvements")
|
||||
regressions: list[str] = Field(default_factory=list, description="Performance regressions")
|
||||
|
||||
speedup_factor: float = Field(..., description="Overall speedup factor")
|
||||
memory_change_mb: float = Field(..., description="Memory usage change (MB)")
|
||||
|
||||
@@ -46,10 +46,7 @@ class BenchmarkRunner:
|
||||
self.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def run(
|
||||
self,
|
||||
name: str,
|
||||
benchmark_func: Callable[[Benchmark], None],
|
||||
save: bool = True
|
||||
self, name: str, benchmark_func: Callable[[Benchmark], None], save: bool = True
|
||||
) -> BenchmarkReport:
|
||||
"""
|
||||
Run single benchmark.
|
||||
@@ -83,7 +80,7 @@ class BenchmarkRunner:
|
||||
filename = f"{name}_{timestamp}.json"
|
||||
path = self.output_dir / filename
|
||||
|
||||
with open(path, 'w') as f:
|
||||
with open(path, "w") as f:
|
||||
f.write(report.model_dump_json(indent=2))
|
||||
|
||||
print(f"📊 Saved benchmark: {path}")
|
||||
@@ -91,9 +88,7 @@ class BenchmarkRunner:
|
||||
return report
|
||||
|
||||
def run_suite(
|
||||
self,
|
||||
benchmarks: dict[str, Callable[[Benchmark], None]],
|
||||
save: bool = True
|
||||
self, benchmarks: dict[str, Callable[[Benchmark], None]], save: bool = True
|
||||
) -> dict[str, BenchmarkReport]:
|
||||
"""
|
||||
Run multiple benchmarks.
|
||||
@@ -122,11 +117,7 @@ class BenchmarkRunner:
|
||||
|
||||
return reports
|
||||
|
||||
def compare(
|
||||
self,
|
||||
baseline_path: Path,
|
||||
current_path: Path
|
||||
) -> ComparisonReport:
|
||||
def compare(self, baseline_path: Path, current_path: Path) -> ComparisonReport:
|
||||
"""
|
||||
Compare two benchmark reports.
|
||||
|
||||
@@ -215,7 +206,7 @@ class BenchmarkRunner:
|
||||
improvements=improvements,
|
||||
regressions=regressions,
|
||||
speedup_factor=speedup_factor,
|
||||
memory_change_mb=memory_change_mb
|
||||
memory_change_mb=memory_change_mb,
|
||||
)
|
||||
|
||||
def list_benchmarks(self) -> list[dict[str, Any]]:
|
||||
@@ -237,13 +228,15 @@ class BenchmarkRunner:
|
||||
with open(path) as f:
|
||||
data = json.load(f)
|
||||
|
||||
benchmarks.append({
|
||||
"name": data["name"],
|
||||
"path": str(path),
|
||||
"started_at": data["started_at"],
|
||||
"duration": data["total_duration"],
|
||||
"operations": len(data.get("timings", []))
|
||||
})
|
||||
benchmarks.append(
|
||||
{
|
||||
"name": data["name"],
|
||||
"path": str(path),
|
||||
"started_at": data["started_at"],
|
||||
"duration": data["total_duration"],
|
||||
"operations": len(data.get("timings", [])),
|
||||
}
|
||||
)
|
||||
except Exception:
|
||||
# Skip invalid files
|
||||
continue
|
||||
|
||||
@@ -74,7 +74,7 @@ class SkillAdaptor(ABC):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill for platform (ZIP, tar.gz, etc.).
|
||||
@@ -282,7 +282,7 @@ class SkillAdaptor(ABC):
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True,
|
||||
source_file: str = None
|
||||
source_file: str = None,
|
||||
) -> list[tuple[str, dict]]:
|
||||
"""
|
||||
Optionally chunk content for RAG platforms.
|
||||
@@ -326,33 +326,31 @@ class SkillAdaptor(ABC):
|
||||
chunk_overlap=max(50, chunk_max_tokens // 10), # 10% overlap
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
preserve_paragraphs=True,
|
||||
min_chunk_size=100 # 100 tokens minimum
|
||||
min_chunk_size=100, # 100 tokens minimum
|
||||
)
|
||||
|
||||
# Chunk the document
|
||||
chunks = chunker.chunk_document(
|
||||
text=content,
|
||||
metadata=metadata,
|
||||
source_file=source_file or metadata.get('file', 'unknown')
|
||||
source_file=source_file or metadata.get("file", "unknown"),
|
||||
)
|
||||
|
||||
# Convert RAGChunker output format to (text, metadata) tuples
|
||||
result = []
|
||||
for chunk_dict in chunks:
|
||||
chunk_text = chunk_dict['page_content']
|
||||
chunk_text = chunk_dict["page_content"]
|
||||
chunk_meta = {
|
||||
**metadata, # Base metadata
|
||||
**chunk_dict['metadata'], # RAGChunker metadata (chunk_index, etc.)
|
||||
'is_chunked': True,
|
||||
'chunk_id': chunk_dict['chunk_id']
|
||||
**chunk_dict["metadata"], # RAGChunker metadata (chunk_index, etc.)
|
||||
"is_chunked": True,
|
||||
"chunk_id": chunk_dict["chunk_id"],
|
||||
}
|
||||
result.append((chunk_text, chunk_meta))
|
||||
|
||||
return result
|
||||
|
||||
def _format_output_path(
|
||||
self, skill_dir: Path, output_path: Path, suffix: str
|
||||
) -> Path:
|
||||
def _format_output_path(self, skill_dir: Path, output_path: Path, suffix: str) -> Path:
|
||||
"""
|
||||
Generate standardized output path with intelligent format handling.
|
||||
|
||||
@@ -379,11 +377,13 @@ class SkillAdaptor(ABC):
|
||||
output_str = str(output_path)
|
||||
|
||||
# Extract the file extension from suffix (e.g., ".json" from "-langchain.json")
|
||||
correct_ext = suffix.split('.')[-1] if '.' in suffix else ''
|
||||
correct_ext = suffix.split(".")[-1] if "." in suffix else ""
|
||||
|
||||
if correct_ext and not output_str.endswith(f".{correct_ext}"):
|
||||
# Replace common incorrect extensions
|
||||
output_str = output_str.replace(".zip", f".{correct_ext}").replace(".tar.gz", f".{correct_ext}")
|
||||
output_str = output_str.replace(".zip", f".{correct_ext}").replace(
|
||||
".tar.gz", f".{correct_ext}"
|
||||
)
|
||||
|
||||
# Ensure platform suffix is present
|
||||
if not output_str.endswith(suffix):
|
||||
@@ -395,9 +395,7 @@ class SkillAdaptor(ABC):
|
||||
|
||||
return Path(output_str)
|
||||
|
||||
def _generate_deterministic_id(
|
||||
self, content: str, metadata: dict, format: str = "hex"
|
||||
) -> str:
|
||||
def _generate_deterministic_id(self, content: str, metadata: dict, format: str = "hex") -> str:
|
||||
"""
|
||||
Generate deterministic ID from content and metadata.
|
||||
|
||||
|
||||
@@ -43,11 +43,7 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
return self._generate_deterministic_id(content, metadata, format="hex")
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as JSON for Chroma ingestion.
|
||||
@@ -90,9 +86,9 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -120,9 +116,9 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -149,7 +145,7 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for Chroma.
|
||||
@@ -183,7 +179,7 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
@@ -233,7 +229,7 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
except ImportError:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "chromadb not installed. Run: pip install chromadb"
|
||||
"message": "chromadb not installed. Run: pip install chromadb",
|
||||
}
|
||||
|
||||
# Load package
|
||||
@@ -241,8 +237,8 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
data = json.load(f)
|
||||
|
||||
# Determine client type and configuration
|
||||
persist_directory = kwargs.get('persist_directory')
|
||||
chroma_url = kwargs.get('chroma_url')
|
||||
persist_directory = kwargs.get("persist_directory")
|
||||
chroma_url = kwargs.get("chroma_url")
|
||||
|
||||
try:
|
||||
if persist_directory:
|
||||
@@ -253,15 +249,15 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
# Remote HTTP client
|
||||
print(f"🌐 Connecting to ChromaDB at: {chroma_url}")
|
||||
# Parse URL
|
||||
if '://' in chroma_url:
|
||||
parts = chroma_url.split('://')
|
||||
if "://" in chroma_url:
|
||||
parts = chroma_url.split("://")
|
||||
parts[0]
|
||||
host_port = parts[1]
|
||||
else:
|
||||
host_port = chroma_url
|
||||
|
||||
if ':' in host_port:
|
||||
host, port = host_port.rsplit(':', 1)
|
||||
if ":" in host_port:
|
||||
host, port = host_port.rsplit(":", 1)
|
||||
port = int(port)
|
||||
else:
|
||||
host = host_port
|
||||
@@ -276,12 +272,12 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Failed to connect to ChromaDB: {e}\n\nTry:\n pip install chromadb\n chroma run # Start local server"
|
||||
"message": f"Failed to connect to ChromaDB: {e}\n\nTry:\n pip install chromadb\n chroma run # Start local server",
|
||||
}
|
||||
|
||||
# Get or create collection
|
||||
collection_name = kwargs.get('collection_name', data.get('collection_name', 'skill_docs'))
|
||||
distance_function = kwargs.get('distance_function', 'cosine')
|
||||
collection_name = kwargs.get("collection_name", data.get("collection_name", "skill_docs"))
|
||||
distance_function = kwargs.get("distance_function", "cosine")
|
||||
|
||||
try:
|
||||
# Try to get existing collection
|
||||
@@ -291,62 +287,57 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
try:
|
||||
# Create new collection
|
||||
metadata = {"hnsw:space": distance_function}
|
||||
collection = client.create_collection(
|
||||
name=collection_name,
|
||||
metadata=metadata
|
||||
)
|
||||
collection = client.create_collection(name=collection_name, metadata=metadata)
|
||||
print(f"✅ Created collection: {collection_name} (distance: {distance_function})")
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Failed to create collection '{collection_name}': {e}"
|
||||
"message": f"Failed to create collection '{collection_name}': {e}",
|
||||
}
|
||||
|
||||
# Handle embeddings
|
||||
embedding_function = kwargs.get('embedding_function')
|
||||
embedding_function = kwargs.get("embedding_function")
|
||||
|
||||
try:
|
||||
if embedding_function == 'openai':
|
||||
if embedding_function == "openai":
|
||||
# Generate embeddings with OpenAI
|
||||
print("🔄 Generating OpenAI embeddings...")
|
||||
embeddings = self._generate_openai_embeddings(
|
||||
data['documents'],
|
||||
api_key=kwargs.get('openai_api_key')
|
||||
data["documents"], api_key=kwargs.get("openai_api_key")
|
||||
)
|
||||
collection.add(
|
||||
documents=data['documents'],
|
||||
metadatas=data['metadatas'],
|
||||
ids=data['ids'],
|
||||
embeddings=embeddings
|
||||
documents=data["documents"],
|
||||
metadatas=data["metadatas"],
|
||||
ids=data["ids"],
|
||||
embeddings=embeddings,
|
||||
)
|
||||
elif embedding_function == 'sentence-transformers':
|
||||
elif embedding_function == "sentence-transformers":
|
||||
# Use sentence-transformers
|
||||
print("🔄 Generating sentence-transformer embeddings...")
|
||||
try:
|
||||
from chromadb.utils import embedding_functions
|
||||
|
||||
ef = embedding_functions.SentenceTransformerEmbeddingFunction()
|
||||
embeddings = [ef([doc])[0] for doc in data['documents']]
|
||||
embeddings = [ef([doc])[0] for doc in data["documents"]]
|
||||
collection.add(
|
||||
documents=data['documents'],
|
||||
metadatas=data['metadatas'],
|
||||
ids=data['ids'],
|
||||
embeddings=embeddings
|
||||
documents=data["documents"],
|
||||
metadatas=data["metadatas"],
|
||||
ids=data["ids"],
|
||||
embeddings=embeddings,
|
||||
)
|
||||
except ImportError:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "sentence-transformers not installed. Run: pip install sentence-transformers"
|
||||
"message": "sentence-transformers not installed. Run: pip install sentence-transformers",
|
||||
}
|
||||
else:
|
||||
# No embeddings - Chroma will auto-generate
|
||||
print("🔄 Using Chroma's default embedding function...")
|
||||
collection.add(
|
||||
documents=data['documents'],
|
||||
metadatas=data['metadatas'],
|
||||
ids=data['ids']
|
||||
documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"]
|
||||
)
|
||||
|
||||
count = len(data['documents'])
|
||||
count = len(data["documents"])
|
||||
print(f"✅ Uploaded {count} documents to ChromaDB")
|
||||
print(f"📊 Collection '{collection_name}' now has {collection.count()} total documents")
|
||||
|
||||
@@ -355,19 +346,14 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
"message": f"Uploaded {count} documents to ChromaDB collection '{collection_name}'",
|
||||
"collection": collection_name,
|
||||
"count": count,
|
||||
"url": f"{chroma_url}/collections/{collection_name}" if chroma_url else None
|
||||
"url": f"{chroma_url}/collections/{collection_name}" if chroma_url else None,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Upload failed: {e}"
|
||||
}
|
||||
return {"success": False, "message": f"Upload failed: {e}"}
|
||||
|
||||
def _generate_openai_embeddings(
|
||||
self,
|
||||
documents: list[str],
|
||||
api_key: str = None
|
||||
self, documents: list[str], api_key: str = None
|
||||
) -> list[list[float]]:
|
||||
"""
|
||||
Generate embeddings using OpenAI API.
|
||||
@@ -380,12 +366,13 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
List of embedding vectors
|
||||
"""
|
||||
import os
|
||||
|
||||
try:
|
||||
from openai import OpenAI
|
||||
except ImportError:
|
||||
raise ImportError("openai not installed. Run: pip install openai") from None
|
||||
|
||||
api_key = api_key or os.getenv('OPENAI_API_KEY')
|
||||
api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
|
||||
|
||||
@@ -398,14 +385,14 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
print(f" Generating embeddings for {len(documents)} documents...")
|
||||
|
||||
for i in range(0, len(documents), batch_size):
|
||||
batch = documents[i:i+batch_size]
|
||||
batch = documents[i : i + batch_size]
|
||||
try:
|
||||
response = client.embeddings.create(
|
||||
input=batch,
|
||||
model="text-embedding-3-small" # Cheapest, fastest
|
||||
model="text-embedding-3-small", # Cheapest, fastest
|
||||
)
|
||||
embeddings.extend([item.embedding for item in response.data])
|
||||
print(f" ✓ Processed {min(i+batch_size, len(documents))}/{len(documents)}")
|
||||
print(f" ✓ Processed {min(i + batch_size, len(documents))}/{len(documents)}")
|
||||
except Exception as e:
|
||||
raise Exception(f"OpenAI embedding generation failed: {e}") from e
|
||||
|
||||
|
||||
@@ -81,7 +81,14 @@ version: {metadata.version}
|
||||
{content_body}
|
||||
"""
|
||||
|
||||
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
|
||||
def package(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into ZIP file for Claude.
|
||||
|
||||
|
||||
@@ -46,11 +46,7 @@ class FAISSHelpers(SkillAdaptor):
|
||||
return self._generate_deterministic_id(content, metadata, format="hex")
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as JSON for FAISS ingestion.
|
||||
@@ -92,9 +88,9 @@ class FAISSHelpers(SkillAdaptor):
|
||||
content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -121,9 +117,9 @@ class FAISSHelpers(SkillAdaptor):
|
||||
ref_content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -160,7 +156,7 @@ class FAISSHelpers(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for FAISS.
|
||||
@@ -193,7 +189,7 @@ class FAISSHelpers(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -86,7 +86,14 @@ See the references directory for complete documentation with examples and best p
|
||||
# Return plain markdown (NO frontmatter)
|
||||
return content_body
|
||||
|
||||
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
|
||||
def package(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into tar.gz file for Gemini.
|
||||
|
||||
|
||||
@@ -29,11 +29,7 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
DEFAULT_API_ENDPOINT = None # No upload endpoint
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as JSON array of Haystack Documents.
|
||||
@@ -73,17 +69,19 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
content,
|
||||
doc_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks as documents
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append({
|
||||
"content": chunk_text,
|
||||
"meta": chunk_meta,
|
||||
})
|
||||
documents.append(
|
||||
{
|
||||
"content": chunk_text,
|
||||
"meta": chunk_meta,
|
||||
}
|
||||
)
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
@@ -104,17 +102,19 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
doc_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks as documents
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append({
|
||||
"content": chunk_text,
|
||||
"meta": chunk_meta,
|
||||
})
|
||||
documents.append(
|
||||
{
|
||||
"content": chunk_text,
|
||||
"meta": chunk_meta,
|
||||
}
|
||||
)
|
||||
|
||||
# Return as formatted JSON
|
||||
return json.dumps(documents, indent=2, ensure_ascii=False)
|
||||
@@ -125,7 +125,7 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for Haystack.
|
||||
@@ -159,7 +159,7 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -29,11 +29,7 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
DEFAULT_API_ENDPOINT = None # No upload endpoint
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as JSON array of LangChain Documents.
|
||||
@@ -73,17 +69,14 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks to documents
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append({
|
||||
"page_content": chunk_text,
|
||||
"metadata": chunk_meta
|
||||
})
|
||||
documents.append({"page_content": chunk_text, "metadata": chunk_meta})
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
@@ -104,17 +97,14 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks to documents
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append({
|
||||
"page_content": chunk_text,
|
||||
"metadata": chunk_meta
|
||||
})
|
||||
documents.append({"page_content": chunk_text, "metadata": chunk_meta})
|
||||
|
||||
# Return as formatted JSON
|
||||
return json.dumps(documents, indent=2, ensure_ascii=False)
|
||||
@@ -125,7 +115,7 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for LangChain.
|
||||
@@ -162,7 +152,7 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -42,11 +42,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
return self._generate_deterministic_id(content, metadata, format="hex")
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as JSON array of LlamaIndex Nodes.
|
||||
@@ -88,19 +84,21 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
content,
|
||||
node_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks as nodes
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
nodes.append({
|
||||
"text": chunk_text,
|
||||
"metadata": chunk_meta,
|
||||
"id_": self._generate_node_id(chunk_text, chunk_meta),
|
||||
"embedding": None,
|
||||
})
|
||||
nodes.append(
|
||||
{
|
||||
"text": chunk_text,
|
||||
"metadata": chunk_meta,
|
||||
"id_": self._generate_node_id(chunk_text, chunk_meta),
|
||||
"embedding": None,
|
||||
}
|
||||
)
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
@@ -121,19 +119,21 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
node_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks as nodes
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
nodes.append({
|
||||
"text": chunk_text,
|
||||
"metadata": chunk_meta,
|
||||
"id_": self._generate_node_id(chunk_text, chunk_meta),
|
||||
"embedding": None,
|
||||
})
|
||||
nodes.append(
|
||||
{
|
||||
"text": chunk_text,
|
||||
"metadata": chunk_meta,
|
||||
"id_": self._generate_node_id(chunk_text, chunk_meta),
|
||||
"embedding": None,
|
||||
}
|
||||
)
|
||||
|
||||
# Return as formatted JSON
|
||||
return json.dumps(nodes, indent=2, ensure_ascii=False)
|
||||
@@ -144,7 +144,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for LlamaIndex.
|
||||
@@ -178,7 +178,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -81,7 +81,14 @@ Browse the reference files for detailed information on each topic. All files are
|
||||
# Return pure markdown (no frontmatter, no special formatting)
|
||||
return content_body
|
||||
|
||||
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
|
||||
def package(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into ZIP file with markdown documentation.
|
||||
|
||||
|
||||
@@ -103,7 +103,14 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
# Return plain text instructions (NO frontmatter)
|
||||
return content_body
|
||||
|
||||
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
|
||||
def package(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into ZIP file for OpenAI Assistants.
|
||||
|
||||
|
||||
@@ -44,11 +44,7 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
return self._generate_deterministic_id(content, metadata, format="uuid5")
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as Qdrant collection JSON.
|
||||
@@ -87,30 +83,35 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
content,
|
||||
payload_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks as points
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
point_id = self._generate_point_id(chunk_text, {
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"file": chunk_meta.get("file", "SKILL.md")
|
||||
})
|
||||
|
||||
points.append({
|
||||
"id": point_id,
|
||||
"vector": None, # User will generate embeddings
|
||||
"payload": {
|
||||
"content": chunk_text,
|
||||
point_id = self._generate_point_id(
|
||||
chunk_text,
|
||||
{
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", "overview"),
|
||||
"file": chunk_meta.get("file", "SKILL.md"),
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
)
|
||||
|
||||
points.append(
|
||||
{
|
||||
"id": point_id,
|
||||
"vector": None, # User will generate embeddings
|
||||
"payload": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", "overview"),
|
||||
"file": chunk_meta.get("file", "SKILL.md"),
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
}
|
||||
})
|
||||
)
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
@@ -130,30 +131,35 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
payload_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks as points
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
point_id = self._generate_point_id(chunk_text, {
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"file": chunk_meta.get("file", ref_file.name)
|
||||
})
|
||||
|
||||
points.append({
|
||||
"id": point_id,
|
||||
"vector": None, # User will generate embeddings
|
||||
"payload": {
|
||||
"content": chunk_text,
|
||||
point_id = self._generate_point_id(
|
||||
chunk_text,
|
||||
{
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", category),
|
||||
"file": chunk_meta.get("file", ref_file.name),
|
||||
"type": chunk_meta.get("type", "reference"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
)
|
||||
|
||||
points.append(
|
||||
{
|
||||
"id": point_id,
|
||||
"vector": None, # User will generate embeddings
|
||||
"payload": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", category),
|
||||
"file": chunk_meta.get("file", ref_file.name),
|
||||
"type": chunk_meta.get("type", "reference"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
}
|
||||
})
|
||||
)
|
||||
|
||||
# Qdrant configuration
|
||||
config = {
|
||||
@@ -184,7 +190,7 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for Qdrant.
|
||||
@@ -217,7 +223,7 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -36,7 +36,7 @@ class StreamingAdaptorMixin:
|
||||
chunk_size: int = 4000,
|
||||
chunk_overlap: int = 200,
|
||||
batch_size: int = 100,
|
||||
progress_callback: callable | None = None
|
||||
progress_callback: callable | None = None,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill using streaming ingestion.
|
||||
@@ -60,9 +60,7 @@ class StreamingAdaptorMixin:
|
||||
|
||||
# Initialize streaming ingester
|
||||
ingester = StreamingIngester(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
batch_size=batch_size
|
||||
chunk_size=chunk_size, chunk_overlap=chunk_overlap, batch_size=batch_size
|
||||
)
|
||||
|
||||
print(f"\n📊 Streaming ingestion starting...")
|
||||
@@ -77,9 +75,11 @@ class StreamingAdaptorMixin:
|
||||
nonlocal last_update
|
||||
# Update every 10 chunks
|
||||
if progress.processed_chunks - last_update >= 10:
|
||||
print(f" {progress.progress_percent:.1f}% - "
|
||||
f"{progress.processed_chunks}/{progress.total_chunks} chunks "
|
||||
f"({progress.chunks_per_second:.1f} chunks/sec)")
|
||||
print(
|
||||
f" {progress.progress_percent:.1f}% - "
|
||||
f"{progress.processed_chunks}/{progress.total_chunks} chunks "
|
||||
f"({progress.chunks_per_second:.1f} chunks/sec)"
|
||||
)
|
||||
last_update = progress.processed_chunks
|
||||
|
||||
if progress_callback:
|
||||
@@ -97,10 +97,7 @@ class StreamingAdaptorMixin:
|
||||
|
||||
# Convert chunks to platform format
|
||||
print(f"\n📦 Converting to {self.PLATFORM_NAME} format...")
|
||||
package_data = self._convert_chunks_to_platform_format(
|
||||
all_chunks,
|
||||
skill_dir.name
|
||||
)
|
||||
package_data = self._convert_chunks_to_platform_format(all_chunks, skill_dir.name)
|
||||
|
||||
# Determine output filename
|
||||
if output_path.is_dir() or str(output_path).endswith("/"):
|
||||
@@ -114,8 +111,7 @@ class StreamingAdaptorMixin:
|
||||
# Write output
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
output_path.write_text(
|
||||
json.dumps(package_data, indent=2, ensure_ascii=False),
|
||||
encoding="utf-8"
|
||||
json.dumps(package_data, indent=2, ensure_ascii=False), encoding="utf-8"
|
||||
)
|
||||
|
||||
print(f"✅ Package created: {output_path}")
|
||||
@@ -124,9 +120,7 @@ class StreamingAdaptorMixin:
|
||||
return output_path
|
||||
|
||||
def _convert_chunks_to_platform_format(
|
||||
self,
|
||||
chunks: list[tuple[str, dict]],
|
||||
skill_name: str
|
||||
self, chunks: list[tuple[str, dict]], skill_name: str
|
||||
) -> dict:
|
||||
"""
|
||||
Convert chunks to platform-specific format.
|
||||
@@ -156,14 +150,11 @@ class StreamingAdaptorMixin:
|
||||
"metadatas": metadatas,
|
||||
"ids": ids,
|
||||
"total_chunks": len(chunks),
|
||||
"streaming": True
|
||||
"streaming": True,
|
||||
}
|
||||
|
||||
def estimate_chunks(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
chunk_size: int = 4000,
|
||||
chunk_overlap: int = 200
|
||||
self, skill_dir: Path, chunk_size: int = 4000, chunk_overlap: int = 200
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Estimate chunking for a skill directory.
|
||||
@@ -179,10 +170,7 @@ class StreamingAdaptorMixin:
|
||||
Estimation statistics
|
||||
"""
|
||||
skill_dir = Path(skill_dir)
|
||||
StreamingIngester(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap
|
||||
)
|
||||
StreamingIngester(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
||||
|
||||
# Count files and estimate chunks
|
||||
total_docs = 0
|
||||
@@ -201,11 +189,9 @@ class StreamingAdaptorMixin:
|
||||
total_chars += char_count
|
||||
estimated_chunks += chunk_count
|
||||
|
||||
file_stats.append({
|
||||
"file": "SKILL.md",
|
||||
"chars": char_count,
|
||||
"estimated_chunks": chunk_count
|
||||
})
|
||||
file_stats.append(
|
||||
{"file": "SKILL.md", "chars": char_count, "estimated_chunks": chunk_count}
|
||||
)
|
||||
|
||||
# Reference files
|
||||
refs_dir = skill_dir / "references"
|
||||
@@ -214,17 +200,21 @@ class StreamingAdaptorMixin:
|
||||
if ref_file.is_file() and not ref_file.name.startswith("."):
|
||||
content = ref_file.read_text(encoding="utf-8")
|
||||
char_count = len(content)
|
||||
chunk_count = max(1, (char_count - chunk_overlap) // (chunk_size - chunk_overlap) + 1)
|
||||
chunk_count = max(
|
||||
1, (char_count - chunk_overlap) // (chunk_size - chunk_overlap) + 1
|
||||
)
|
||||
|
||||
total_docs += 1
|
||||
total_chars += char_count
|
||||
estimated_chunks += chunk_count
|
||||
|
||||
file_stats.append({
|
||||
"file": ref_file.name,
|
||||
"chars": char_count,
|
||||
"estimated_chunks": chunk_count
|
||||
})
|
||||
file_stats.append(
|
||||
{
|
||||
"file": ref_file.name,
|
||||
"chars": char_count,
|
||||
"estimated_chunks": chunk_count,
|
||||
}
|
||||
)
|
||||
|
||||
return {
|
||||
"skill_name": skill_dir.name,
|
||||
@@ -235,7 +225,7 @@ class StreamingAdaptorMixin:
|
||||
"chunk_overlap": chunk_overlap,
|
||||
"file_stats": file_stats,
|
||||
"estimated_memory_mb": (total_chars * 2) / (1024 * 1024), # UTF-8 estimate
|
||||
"recommended_streaming": total_chars > 1_000_000 or total_docs > 100
|
||||
"recommended_streaming": total_chars > 1_000_000 or total_docs > 100,
|
||||
}
|
||||
|
||||
|
||||
@@ -251,25 +241,27 @@ class StreamingLangChainAdaptor(StreamingAdaptorMixin):
|
||||
documents = []
|
||||
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append({
|
||||
"page_content": chunk_text,
|
||||
"metadata": {
|
||||
"source": chunk_meta["source"],
|
||||
"category": chunk_meta["category"],
|
||||
"file": chunk_meta["file"],
|
||||
"chunk_id": chunk_meta["chunk_id"],
|
||||
"chunk_index": chunk_meta["chunk_index"],
|
||||
"total_chunks": chunk_meta["total_chunks"],
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", "1.0.0"),
|
||||
documents.append(
|
||||
{
|
||||
"page_content": chunk_text,
|
||||
"metadata": {
|
||||
"source": chunk_meta["source"],
|
||||
"category": chunk_meta["category"],
|
||||
"file": chunk_meta["file"],
|
||||
"chunk_id": chunk_meta["chunk_id"],
|
||||
"chunk_index": chunk_meta["chunk_index"],
|
||||
"total_chunks": chunk_meta["total_chunks"],
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", "1.0.0"),
|
||||
},
|
||||
}
|
||||
})
|
||||
)
|
||||
|
||||
return {
|
||||
"documents": documents,
|
||||
"total_chunks": len(chunks),
|
||||
"streaming": True,
|
||||
"format": "LangChain Document"
|
||||
"format": "LangChain Document",
|
||||
}
|
||||
|
||||
|
||||
@@ -287,14 +279,16 @@ class StreamingChromaAdaptor(StreamingAdaptorMixin):
|
||||
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append(chunk_text)
|
||||
metadatas.append({
|
||||
"source": chunk_meta["source"],
|
||||
"category": chunk_meta["category"],
|
||||
"file": chunk_meta["file"],
|
||||
"chunk_index": chunk_meta["chunk_index"],
|
||||
"total_chunks": chunk_meta["total_chunks"],
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
})
|
||||
metadatas.append(
|
||||
{
|
||||
"source": chunk_meta["source"],
|
||||
"category": chunk_meta["category"],
|
||||
"file": chunk_meta["file"],
|
||||
"chunk_index": chunk_meta["chunk_index"],
|
||||
"total_chunks": chunk_meta["total_chunks"],
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
}
|
||||
)
|
||||
ids.append(chunk_meta["chunk_id"])
|
||||
|
||||
return {
|
||||
@@ -303,7 +297,7 @@ class StreamingChromaAdaptor(StreamingAdaptorMixin):
|
||||
"ids": ids,
|
||||
"collection_name": skill_name.replace("_", "-"),
|
||||
"total_chunks": len(chunks),
|
||||
"streaming": True
|
||||
"streaming": True,
|
||||
}
|
||||
|
||||
|
||||
@@ -339,11 +333,7 @@ def demo_streaming():
|
||||
print("=" * 60)
|
||||
|
||||
output = adaptor.package_streaming(
|
||||
skill_dir,
|
||||
Path("output"),
|
||||
chunk_size=2000,
|
||||
chunk_overlap=100,
|
||||
batch_size=50
|
||||
skill_dir, Path("output"), chunk_size=2000, chunk_overlap=100, batch_size=50
|
||||
)
|
||||
|
||||
print(f"\n✅ Complete! Output: {output}")
|
||||
|
||||
@@ -104,11 +104,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
}
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as JSON for Weaviate ingestion.
|
||||
@@ -148,24 +144,26 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
content,
|
||||
obj_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks as objects
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
objects.append({
|
||||
"id": self._generate_uuid(chunk_text, chunk_meta),
|
||||
"properties": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", "overview"),
|
||||
"file": chunk_meta.get("file", "SKILL.md"),
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
})
|
||||
objects.append(
|
||||
{
|
||||
"id": self._generate_uuid(chunk_text, chunk_meta),
|
||||
"properties": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", "overview"),
|
||||
"file": chunk_meta.get("file", "SKILL.md"),
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
@@ -186,24 +184,26 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
obj_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks as objects
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
objects.append({
|
||||
"id": self._generate_uuid(chunk_text, chunk_meta),
|
||||
"properties": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", category),
|
||||
"file": chunk_meta.get("file", ref_file.name),
|
||||
"type": chunk_meta.get("type", "reference"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
})
|
||||
objects.append(
|
||||
{
|
||||
"id": self._generate_uuid(chunk_text, chunk_meta),
|
||||
"properties": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", category),
|
||||
"file": chunk_meta.get("file", ref_file.name),
|
||||
"type": chunk_meta.get("type", "reference"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
# Generate schema
|
||||
class_name = "".join(word.capitalize() for word in metadata.name.split("_"))
|
||||
@@ -222,7 +222,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for Weaviate.
|
||||
@@ -258,7 +258,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
@@ -310,7 +310,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
except ImportError:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "weaviate-client not installed. Run: pip install weaviate-client"
|
||||
"message": "weaviate-client not installed. Run: pip install weaviate-client",
|
||||
}
|
||||
|
||||
# Load package
|
||||
@@ -319,16 +319,16 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
|
||||
# Connect to Weaviate
|
||||
try:
|
||||
if kwargs.get('use_cloud') and api_key:
|
||||
if kwargs.get("use_cloud") and api_key:
|
||||
# Weaviate Cloud
|
||||
print(f"🌐 Connecting to Weaviate Cloud: {kwargs.get('cluster_url')}")
|
||||
client = weaviate.Client(
|
||||
url=kwargs.get('cluster_url'),
|
||||
auth_client_secret=weaviate.AuthApiKey(api_key=api_key)
|
||||
url=kwargs.get("cluster_url"),
|
||||
auth_client_secret=weaviate.AuthApiKey(api_key=api_key),
|
||||
)
|
||||
else:
|
||||
# Local Weaviate instance
|
||||
weaviate_url = kwargs.get('weaviate_url', 'http://localhost:8080')
|
||||
weaviate_url = kwargs.get("weaviate_url", "http://localhost:8080")
|
||||
print(f"🌐 Connecting to Weaviate at: {weaviate_url}")
|
||||
client = weaviate.Client(url=weaviate_url)
|
||||
|
||||
@@ -336,69 +336,67 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
if not client.is_ready():
|
||||
return {
|
||||
"success": False,
|
||||
"message": "Weaviate server not ready. Make sure Weaviate is running:\n docker run -p 8080:8080 semitechnologies/weaviate:latest"
|
||||
"message": "Weaviate server not ready. Make sure Weaviate is running:\n docker run -p 8080:8080 semitechnologies/weaviate:latest",
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Failed to connect to Weaviate: {e}\n\nMake sure Weaviate is running or provide correct credentials."
|
||||
"message": f"Failed to connect to Weaviate: {e}\n\nMake sure Weaviate is running or provide correct credentials.",
|
||||
}
|
||||
|
||||
# Create schema
|
||||
try:
|
||||
client.schema.create_class(data['schema'])
|
||||
client.schema.create_class(data["schema"])
|
||||
print(f"✅ Created schema: {data['class_name']}")
|
||||
except Exception as e:
|
||||
if "already exists" in str(e).lower():
|
||||
print(f"ℹ️ Schema already exists: {data['class_name']}")
|
||||
else:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Schema creation failed: {e}"
|
||||
}
|
||||
return {"success": False, "message": f"Schema creation failed: {e}"}
|
||||
|
||||
# Handle embeddings
|
||||
embedding_function = kwargs.get('embedding_function')
|
||||
embedding_function = kwargs.get("embedding_function")
|
||||
|
||||
try:
|
||||
with client.batch as batch:
|
||||
batch.batch_size = 100
|
||||
|
||||
if embedding_function == 'openai':
|
||||
if embedding_function == "openai":
|
||||
# Generate embeddings with OpenAI
|
||||
print("🔄 Generating OpenAI embeddings and uploading...")
|
||||
embeddings = self._generate_openai_embeddings(
|
||||
[obj['properties']['content'] for obj in data['objects']],
|
||||
api_key=kwargs.get('openai_api_key')
|
||||
[obj["properties"]["content"] for obj in data["objects"]],
|
||||
api_key=kwargs.get("openai_api_key"),
|
||||
)
|
||||
|
||||
for i, obj in enumerate(data['objects']):
|
||||
for i, obj in enumerate(data["objects"]):
|
||||
batch.add_data_object(
|
||||
data_object=obj['properties'],
|
||||
class_name=data['class_name'],
|
||||
uuid=obj['id'],
|
||||
vector=embeddings[i]
|
||||
data_object=obj["properties"],
|
||||
class_name=data["class_name"],
|
||||
uuid=obj["id"],
|
||||
vector=embeddings[i],
|
||||
)
|
||||
|
||||
if (i + 1) % 100 == 0:
|
||||
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
|
||||
|
||||
elif embedding_function == 'sentence-transformers':
|
||||
elif embedding_function == "sentence-transformers":
|
||||
# Use sentence-transformers
|
||||
print("🔄 Generating sentence-transformer embeddings and uploading...")
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
model = SentenceTransformer('all-MiniLM-L6-v2')
|
||||
contents = [obj['properties']['content'] for obj in data['objects']]
|
||||
|
||||
model = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
contents = [obj["properties"]["content"] for obj in data["objects"]]
|
||||
embeddings = model.encode(contents, show_progress_bar=True).tolist()
|
||||
|
||||
for i, obj in enumerate(data['objects']):
|
||||
for i, obj in enumerate(data["objects"]):
|
||||
batch.add_data_object(
|
||||
data_object=obj['properties'],
|
||||
class_name=data['class_name'],
|
||||
uuid=obj['id'],
|
||||
vector=embeddings[i]
|
||||
data_object=obj["properties"],
|
||||
class_name=data["class_name"],
|
||||
uuid=obj["id"],
|
||||
vector=embeddings[i],
|
||||
)
|
||||
|
||||
if (i + 1) % 100 == 0:
|
||||
@@ -407,42 +405,37 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
except ImportError:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "sentence-transformers not installed. Run: pip install sentence-transformers"
|
||||
"message": "sentence-transformers not installed. Run: pip install sentence-transformers",
|
||||
}
|
||||
|
||||
else:
|
||||
# No embeddings - Weaviate will use its configured vectorizer
|
||||
print("🔄 Uploading objects (Weaviate will generate embeddings)...")
|
||||
for i, obj in enumerate(data['objects']):
|
||||
for i, obj in enumerate(data["objects"]):
|
||||
batch.add_data_object(
|
||||
data_object=obj['properties'],
|
||||
class_name=data['class_name'],
|
||||
uuid=obj['id']
|
||||
data_object=obj["properties"],
|
||||
class_name=data["class_name"],
|
||||
uuid=obj["id"],
|
||||
)
|
||||
|
||||
if (i + 1) % 100 == 0:
|
||||
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
|
||||
|
||||
count = len(data['objects'])
|
||||
count = len(data["objects"])
|
||||
print(f"✅ Upload complete! {count} objects added to Weaviate")
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"message": f"Uploaded {count} objects to Weaviate class '{data['class_name']}'",
|
||||
"class_name": data['class_name'],
|
||||
"count": count
|
||||
"class_name": data["class_name"],
|
||||
"count": count,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Upload failed: {e}"
|
||||
}
|
||||
return {"success": False, "message": f"Upload failed: {e}"}
|
||||
|
||||
def _generate_openai_embeddings(
|
||||
self,
|
||||
documents: list[str],
|
||||
api_key: str = None
|
||||
self, documents: list[str], api_key: str = None
|
||||
) -> list[list[float]]:
|
||||
"""
|
||||
Generate embeddings using OpenAI API.
|
||||
@@ -455,12 +448,13 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
List of embedding vectors
|
||||
"""
|
||||
import os
|
||||
|
||||
try:
|
||||
from openai import OpenAI
|
||||
except ImportError:
|
||||
raise ImportError("openai not installed. Run: pip install openai") from None
|
||||
|
||||
api_key = api_key or os.getenv('OPENAI_API_KEY')
|
||||
api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
|
||||
|
||||
@@ -473,14 +467,16 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
print(f" Generating embeddings for {len(documents)} documents...")
|
||||
|
||||
for i in range(0, len(documents), batch_size):
|
||||
batch = documents[i:i+batch_size]
|
||||
batch = documents[i : i + batch_size]
|
||||
try:
|
||||
response = client.embeddings.create(
|
||||
input=batch,
|
||||
model="text-embedding-3-small" # Cheapest, fastest
|
||||
model="text-embedding-3-small", # Cheapest, fastest
|
||||
)
|
||||
embeddings.extend([item.embedding for item in response.data])
|
||||
print(f" ✓ Generated {min(i+batch_size, len(documents))}/{len(documents)} embeddings")
|
||||
print(
|
||||
f" ✓ Generated {min(i + batch_size, len(documents))}/{len(documents)} embeddings"
|
||||
)
|
||||
except Exception as e:
|
||||
raise Exception(f"OpenAI embedding generation failed: {e}") from e
|
||||
|
||||
|
||||
@@ -101,10 +101,38 @@ class ArchitecturalPatternDetector:
|
||||
# Web Frameworks
|
||||
"Django": ["django", "manage.py", "settings.py", "urls.py"],
|
||||
"Flask": ["flask", "app.py", "wsgi.py"],
|
||||
"Spring": ["springframework", "org.springframework", "@Controller", "@Service", "@Repository"],
|
||||
"ASP.NET": ["Microsoft.AspNetCore", "System.Web", "Controllers", "Models", "Views", ".cshtml", "Startup.cs"],
|
||||
"Rails": ["rails", "action", "app/models", "app/views", "app/controllers", "config/routes.rb"],
|
||||
"Angular": ["@angular", "angular", "app.module.ts", "@Component", "@Injectable", "angular.json"],
|
||||
"Spring": [
|
||||
"springframework",
|
||||
"org.springframework",
|
||||
"@Controller",
|
||||
"@Service",
|
||||
"@Repository",
|
||||
],
|
||||
"ASP.NET": [
|
||||
"Microsoft.AspNetCore",
|
||||
"System.Web",
|
||||
"Controllers",
|
||||
"Models",
|
||||
"Views",
|
||||
".cshtml",
|
||||
"Startup.cs",
|
||||
],
|
||||
"Rails": [
|
||||
"rails",
|
||||
"action",
|
||||
"app/models",
|
||||
"app/views",
|
||||
"app/controllers",
|
||||
"config/routes.rb",
|
||||
],
|
||||
"Angular": [
|
||||
"@angular",
|
||||
"angular",
|
||||
"app.module.ts",
|
||||
"@Component",
|
||||
"@Injectable",
|
||||
"angular.json",
|
||||
],
|
||||
"React": ["react", "package.json", "components"],
|
||||
"Vue.js": ["vue", ".vue", "components"],
|
||||
"Express": ["express", "app.js", "routes"],
|
||||
@@ -208,7 +236,9 @@ class ArchitecturalPatternDetector:
|
||||
|
||||
# Create searchable import string
|
||||
import_content = " ".join(all_imports)
|
||||
logger.debug(f"Collected {len(all_imports)} imports from {len([f for f in files if f.get('imports')])} files for framework detection")
|
||||
logger.debug(
|
||||
f"Collected {len(all_imports)} imports from {len([f for f in files if f.get('imports')])} files for framework detection"
|
||||
)
|
||||
|
||||
# Also check actual directory structure for game engine markers
|
||||
# (project.godot, .unity, .uproject are config files, not in analyzed files)
|
||||
@@ -245,7 +275,9 @@ class ArchitecturalPatternDetector:
|
||||
# Check in file paths, directory structure, AND imports
|
||||
path_matches = sum(1 for marker in markers if marker.lower() in all_content.lower())
|
||||
dir_matches = sum(1 for marker in markers if marker.lower() in dir_content.lower())
|
||||
import_matches = sum(1 for marker in markers if marker.lower() in import_content.lower())
|
||||
import_matches = sum(
|
||||
1 for marker in markers if marker.lower() in import_content.lower()
|
||||
)
|
||||
|
||||
# Strategy: Prioritize import-based detection (more accurate)
|
||||
# If we have import matches, they're strong signals - use them alone
|
||||
@@ -257,7 +289,9 @@ class ArchitecturalPatternDetector:
|
||||
elif (path_matches + dir_matches) >= 2:
|
||||
# Path/directory-based detection (requires 2+ matches)
|
||||
detected.append(framework)
|
||||
logger.info(f" 📦 Detected framework: {framework} (path:{path_matches} dir:{dir_matches})")
|
||||
logger.info(
|
||||
f" 📦 Detected framework: {framework} (path:{path_matches} dir:{dir_matches})"
|
||||
)
|
||||
|
||||
return detected
|
||||
|
||||
|
||||
@@ -77,7 +77,9 @@ def run_embedding_benchmark(runner, config):
|
||||
with bench.timer("batch_embedding"), bench.memory("batch_embedding"):
|
||||
embeddings = generator.generate_batch(texts, model=model)
|
||||
|
||||
bench.metric("embeddings_per_sec", len(embeddings) / bench.result.timings[-1].duration, "emb/sec")
|
||||
bench.metric(
|
||||
"embeddings_per_sec", len(embeddings) / bench.result.timings[-1].duration, "emb/sec"
|
||||
)
|
||||
|
||||
name = config.get("name", "embedding-benchmark")
|
||||
report = runner.run(name, benchmark_func)
|
||||
@@ -97,7 +99,7 @@ def run_storage_benchmark(runner, config):
|
||||
storage = get_storage_adaptor(provider, bucket=bucket)
|
||||
|
||||
# Create test file
|
||||
with NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
|
||||
with NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
|
||||
f.write("Test data" * 1000)
|
||||
test_file = Path(f.name)
|
||||
|
||||
@@ -128,10 +130,7 @@ def compare_command(args):
|
||||
"""Compare two benchmarks."""
|
||||
runner = BenchmarkRunner()
|
||||
|
||||
comparison = runner.compare(
|
||||
baseline_path=Path(args.baseline),
|
||||
current_path=Path(args.current)
|
||||
)
|
||||
comparison = runner.compare(baseline_path=Path(args.baseline), current_path=Path(args.current))
|
||||
|
||||
print(f"\n📊 Comparison: {comparison.name}\n")
|
||||
print(f"Overall: {comparison.overall_improvement}\n")
|
||||
@@ -213,7 +212,7 @@ def cleanup_command(args):
|
||||
def main():
|
||||
"""Main entry point."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Performance benchmarking suite',
|
||||
description="Performance benchmarking suite",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
@@ -233,54 +232,46 @@ Examples:
|
||||
|
||||
# Cleanup old benchmarks
|
||||
skill-seekers-benchmark cleanup --keep 5
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
subparsers = parser.add_subparsers(dest='command', help='Command to execute')
|
||||
subparsers = parser.add_subparsers(dest="command", help="Command to execute")
|
||||
|
||||
# Run command
|
||||
run_parser = subparsers.add_parser('run', help='Run benchmark')
|
||||
run_parser.add_argument('--config', required=True, help='Benchmark config file')
|
||||
run_parser = subparsers.add_parser("run", help="Run benchmark")
|
||||
run_parser.add_argument("--config", required=True, help="Benchmark config file")
|
||||
run_parser.add_argument(
|
||||
'--output-dir', '-o',
|
||||
default='benchmarks',
|
||||
help='Output directory (default: benchmarks)'
|
||||
"--output-dir", "-o", default="benchmarks", help="Output directory (default: benchmarks)"
|
||||
)
|
||||
|
||||
# Compare command
|
||||
compare_parser = subparsers.add_parser('compare', help='Compare two benchmarks')
|
||||
compare_parser.add_argument('--baseline', required=True, help='Baseline benchmark')
|
||||
compare_parser.add_argument('--current', required=True, help='Current benchmark')
|
||||
compare_parser = subparsers.add_parser("compare", help="Compare two benchmarks")
|
||||
compare_parser.add_argument("--baseline", required=True, help="Baseline benchmark")
|
||||
compare_parser.add_argument("--current", required=True, help="Current benchmark")
|
||||
compare_parser.add_argument(
|
||||
'--fail-on-regression',
|
||||
action='store_true',
|
||||
help='Exit with error if regressions detected'
|
||||
"--fail-on-regression", action="store_true", help="Exit with error if regressions detected"
|
||||
)
|
||||
|
||||
# List command
|
||||
list_parser = subparsers.add_parser('list', help='List saved benchmarks')
|
||||
list_parser = subparsers.add_parser("list", help="List saved benchmarks")
|
||||
list_parser.add_argument(
|
||||
'--output-dir', '-o',
|
||||
default='benchmarks',
|
||||
help='Benchmark directory (default: benchmarks)'
|
||||
"--output-dir", "-o", default="benchmarks", help="Benchmark directory (default: benchmarks)"
|
||||
)
|
||||
|
||||
# Show command
|
||||
show_parser = subparsers.add_parser('show', help='Show benchmark details')
|
||||
show_parser.add_argument('path', help='Path to benchmark file')
|
||||
show_parser = subparsers.add_parser("show", help="Show benchmark details")
|
||||
show_parser.add_argument("path", help="Path to benchmark file")
|
||||
|
||||
# Cleanup command
|
||||
cleanup_parser = subparsers.add_parser('cleanup', help='Cleanup old benchmarks')
|
||||
cleanup_parser = subparsers.add_parser("cleanup", help="Cleanup old benchmarks")
|
||||
cleanup_parser.add_argument(
|
||||
'--output-dir', '-o',
|
||||
default='benchmarks',
|
||||
help='Benchmark directory (default: benchmarks)'
|
||||
"--output-dir", "-o", default="benchmarks", help="Benchmark directory (default: benchmarks)"
|
||||
)
|
||||
cleanup_parser.add_argument(
|
||||
'--keep',
|
||||
"--keep",
|
||||
type=int,
|
||||
default=5,
|
||||
help='Number of latest benchmarks to keep per name (default: 5)'
|
||||
help="Number of latest benchmarks to keep per name (default: 5)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
@@ -290,20 +281,20 @@ Examples:
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
if args.command == 'run':
|
||||
if args.command == "run":
|
||||
run_command(args)
|
||||
elif args.command == 'compare':
|
||||
elif args.command == "compare":
|
||||
compare_command(args)
|
||||
elif args.command == 'list':
|
||||
elif args.command == "list":
|
||||
list_command(args)
|
||||
elif args.command == 'show':
|
||||
elif args.command == "show":
|
||||
show_command(args)
|
||||
elif args.command == 'cleanup':
|
||||
elif args.command == "cleanup":
|
||||
cleanup_command(args)
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -15,18 +15,13 @@ from .storage import get_storage_adaptor
|
||||
def upload_command(args):
|
||||
"""Handle upload subcommand."""
|
||||
adaptor = get_storage_adaptor(
|
||||
args.provider,
|
||||
bucket=args.bucket,
|
||||
container=args.container,
|
||||
**parse_extra_args(args.extra)
|
||||
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
|
||||
)
|
||||
|
||||
if Path(args.local_path).is_dir():
|
||||
print(f"📁 Uploading directory: {args.local_path}")
|
||||
uploaded_files = adaptor.upload_directory(
|
||||
args.local_path,
|
||||
args.remote_path,
|
||||
exclude_patterns=args.exclude
|
||||
args.local_path, args.remote_path, exclude_patterns=args.exclude
|
||||
)
|
||||
print(f"✅ Uploaded {len(uploaded_files)} files")
|
||||
if args.verbose:
|
||||
@@ -41,19 +36,13 @@ def upload_command(args):
|
||||
def download_command(args):
|
||||
"""Handle download subcommand."""
|
||||
adaptor = get_storage_adaptor(
|
||||
args.provider,
|
||||
bucket=args.bucket,
|
||||
container=args.container,
|
||||
**parse_extra_args(args.extra)
|
||||
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
|
||||
)
|
||||
|
||||
# Check if remote path is a directory (ends with /)
|
||||
if args.remote_path.endswith('/'):
|
||||
if args.remote_path.endswith("/"):
|
||||
print(f"📁 Downloading directory: {args.remote_path}")
|
||||
downloaded_files = adaptor.download_directory(
|
||||
args.remote_path,
|
||||
args.local_path
|
||||
)
|
||||
downloaded_files = adaptor.download_directory(args.remote_path, args.local_path)
|
||||
print(f"✅ Downloaded {len(downloaded_files)} files")
|
||||
if args.verbose:
|
||||
for file_path in downloaded_files:
|
||||
@@ -67,10 +56,7 @@ def download_command(args):
|
||||
def list_command(args):
|
||||
"""Handle list subcommand."""
|
||||
adaptor = get_storage_adaptor(
|
||||
args.provider,
|
||||
bucket=args.bucket,
|
||||
container=args.container,
|
||||
**parse_extra_args(args.extra)
|
||||
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
|
||||
)
|
||||
|
||||
print(f"📋 Listing files: {args.prefix or '(root)'}")
|
||||
@@ -99,15 +85,12 @@ def list_command(args):
|
||||
def delete_command(args):
|
||||
"""Handle delete subcommand."""
|
||||
adaptor = get_storage_adaptor(
|
||||
args.provider,
|
||||
bucket=args.bucket,
|
||||
container=args.container,
|
||||
**parse_extra_args(args.extra)
|
||||
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
|
||||
)
|
||||
|
||||
if not args.force:
|
||||
response = input(f"⚠️ Delete {args.remote_path}? [y/N]: ")
|
||||
if response.lower() != 'y':
|
||||
if response.lower() != "y":
|
||||
print("❌ Deletion cancelled")
|
||||
return
|
||||
|
||||
@@ -119,10 +102,7 @@ def delete_command(args):
|
||||
def url_command(args):
|
||||
"""Handle url subcommand."""
|
||||
adaptor = get_storage_adaptor(
|
||||
args.provider,
|
||||
bucket=args.bucket,
|
||||
container=args.container,
|
||||
**parse_extra_args(args.extra)
|
||||
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
|
||||
)
|
||||
|
||||
print(f"🔗 Generating signed URL: {args.remote_path}")
|
||||
@@ -134,10 +114,7 @@ def url_command(args):
|
||||
def copy_command(args):
|
||||
"""Handle copy subcommand."""
|
||||
adaptor = get_storage_adaptor(
|
||||
args.provider,
|
||||
bucket=args.bucket,
|
||||
container=args.container,
|
||||
**parse_extra_args(args.extra)
|
||||
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
|
||||
)
|
||||
|
||||
print(f"📋 Copying: {args.source_path} → {args.dest_path}")
|
||||
@@ -147,7 +124,7 @@ def copy_command(args):
|
||||
|
||||
def format_size(size_bytes: int) -> str:
|
||||
"""Format file size in human-readable format."""
|
||||
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
|
||||
for unit in ["B", "KB", "MB", "GB", "TB"]:
|
||||
if size_bytes < 1024.0:
|
||||
return f"{size_bytes:.1f}{unit}"
|
||||
size_bytes /= 1024.0
|
||||
@@ -161,11 +138,11 @@ def parse_extra_args(extra: list | None) -> dict:
|
||||
|
||||
result = {}
|
||||
for arg in extra:
|
||||
if '=' in arg:
|
||||
key, value = arg.split('=', 1)
|
||||
result[key.lstrip('-')] = value
|
||||
if "=" in arg:
|
||||
key, value = arg.split("=", 1)
|
||||
result[key.lstrip("-")] = value
|
||||
else:
|
||||
result[arg.lstrip('-')] = True
|
||||
result[arg.lstrip("-")] = True
|
||||
|
||||
return result
|
||||
|
||||
@@ -173,7 +150,7 @@ def parse_extra_args(extra: list | None) -> dict:
|
||||
def main():
|
||||
"""Main entry point."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Cloud storage operations for Skill Seekers',
|
||||
description="Cloud storage operations for Skill Seekers",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
@@ -197,114 +174,66 @@ Provider-specific options:
|
||||
S3: --region=us-west-2 --endpoint-url=https://...
|
||||
GCS: --project=my-project --credentials-path=/path/to/creds.json
|
||||
Azure: --account-name=myaccount --account-key=...
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
# Global arguments
|
||||
parser.add_argument(
|
||||
'--provider',
|
||||
choices=['s3', 'gcs', 'azure'],
|
||||
required=True,
|
||||
help='Cloud storage provider'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--bucket',
|
||||
help='S3/GCS bucket name (for S3/GCS)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--container',
|
||||
help='Azure container name (for Azure)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--verbose', '-v',
|
||||
action='store_true',
|
||||
help='Verbose output'
|
||||
"--provider", choices=["s3", "gcs", "azure"], required=True, help="Cloud storage provider"
|
||||
)
|
||||
parser.add_argument("--bucket", help="S3/GCS bucket name (for S3/GCS)")
|
||||
parser.add_argument("--container", help="Azure container name (for Azure)")
|
||||
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
|
||||
|
||||
subparsers = parser.add_subparsers(dest='command', help='Command to execute')
|
||||
subparsers = parser.add_subparsers(dest="command", help="Command to execute")
|
||||
|
||||
# Upload command
|
||||
upload_parser = subparsers.add_parser('upload', help='Upload file or directory')
|
||||
upload_parser.add_argument('local_path', help='Local file or directory path')
|
||||
upload_parser.add_argument('remote_path', help='Remote path in cloud storage')
|
||||
upload_parser = subparsers.add_parser("upload", help="Upload file or directory")
|
||||
upload_parser.add_argument("local_path", help="Local file or directory path")
|
||||
upload_parser.add_argument("remote_path", help="Remote path in cloud storage")
|
||||
upload_parser.add_argument(
|
||||
'--exclude',
|
||||
action='append',
|
||||
help='Glob patterns to exclude (for directories)'
|
||||
)
|
||||
upload_parser.add_argument(
|
||||
'extra',
|
||||
nargs='*',
|
||||
help='Provider-specific options (--key=value)'
|
||||
"--exclude", action="append", help="Glob patterns to exclude (for directories)"
|
||||
)
|
||||
upload_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
|
||||
|
||||
# Download command
|
||||
download_parser = subparsers.add_parser('download', help='Download file or directory')
|
||||
download_parser.add_argument('remote_path', help='Remote path in cloud storage')
|
||||
download_parser.add_argument('local_path', help='Local destination path')
|
||||
download_parser.add_argument(
|
||||
'extra',
|
||||
nargs='*',
|
||||
help='Provider-specific options (--key=value)'
|
||||
)
|
||||
download_parser = subparsers.add_parser("download", help="Download file or directory")
|
||||
download_parser.add_argument("remote_path", help="Remote path in cloud storage")
|
||||
download_parser.add_argument("local_path", help="Local destination path")
|
||||
download_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
|
||||
|
||||
# List command
|
||||
list_parser = subparsers.add_parser('list', help='List files in cloud storage')
|
||||
list_parser = subparsers.add_parser("list", help="List files in cloud storage")
|
||||
list_parser.add_argument("--prefix", default="", help="Prefix to filter files")
|
||||
list_parser.add_argument(
|
||||
'--prefix',
|
||||
default='',
|
||||
help='Prefix to filter files'
|
||||
)
|
||||
list_parser.add_argument(
|
||||
'--max-results',
|
||||
type=int,
|
||||
default=1000,
|
||||
help='Maximum number of results'
|
||||
)
|
||||
list_parser.add_argument(
|
||||
'extra',
|
||||
nargs='*',
|
||||
help='Provider-specific options (--key=value)'
|
||||
"--max-results", type=int, default=1000, help="Maximum number of results"
|
||||
)
|
||||
list_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
|
||||
|
||||
# Delete command
|
||||
delete_parser = subparsers.add_parser('delete', help='Delete file from cloud storage')
|
||||
delete_parser.add_argument('remote_path', help='Remote path in cloud storage')
|
||||
delete_parser = subparsers.add_parser("delete", help="Delete file from cloud storage")
|
||||
delete_parser.add_argument("remote_path", help="Remote path in cloud storage")
|
||||
delete_parser.add_argument(
|
||||
'--force', '-f',
|
||||
action='store_true',
|
||||
help='Skip confirmation prompt'
|
||||
)
|
||||
delete_parser.add_argument(
|
||||
'extra',
|
||||
nargs='*',
|
||||
help='Provider-specific options (--key=value)'
|
||||
"--force", "-f", action="store_true", help="Skip confirmation prompt"
|
||||
)
|
||||
delete_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
|
||||
|
||||
# URL command
|
||||
url_parser = subparsers.add_parser('url', help='Generate signed URL')
|
||||
url_parser.add_argument('remote_path', help='Remote path in cloud storage')
|
||||
url_parser = subparsers.add_parser("url", help="Generate signed URL")
|
||||
url_parser.add_argument("remote_path", help="Remote path in cloud storage")
|
||||
url_parser.add_argument(
|
||||
'--expires-in',
|
||||
"--expires-in",
|
||||
type=int,
|
||||
default=3600,
|
||||
help='URL expiration time in seconds (default: 3600)'
|
||||
)
|
||||
url_parser.add_argument(
|
||||
'extra',
|
||||
nargs='*',
|
||||
help='Provider-specific options (--key=value)'
|
||||
help="URL expiration time in seconds (default: 3600)",
|
||||
)
|
||||
url_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
|
||||
|
||||
# Copy command
|
||||
copy_parser = subparsers.add_parser('copy', help='Copy file within cloud storage')
|
||||
copy_parser.add_argument('source_path', help='Source path')
|
||||
copy_parser.add_argument('dest_path', help='Destination path')
|
||||
copy_parser.add_argument(
|
||||
'extra',
|
||||
nargs='*',
|
||||
help='Provider-specific options (--key=value)'
|
||||
)
|
||||
copy_parser = subparsers.add_parser("copy", help="Copy file within cloud storage")
|
||||
copy_parser.add_argument("source_path", help="Source path")
|
||||
copy_parser.add_argument("dest_path", help="Destination path")
|
||||
copy_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -313,26 +242,26 @@ Provider-specific options:
|
||||
sys.exit(1)
|
||||
|
||||
# Validate bucket/container based on provider
|
||||
if args.provider in ['s3', 'gcs'] and not args.bucket:
|
||||
if args.provider in ["s3", "gcs"] and not args.bucket:
|
||||
print(f"❌ Error: --bucket is required for {args.provider.upper()}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
elif args.provider == 'azure' and not args.container:
|
||||
elif args.provider == "azure" and not args.container:
|
||||
print("❌ Error: --container is required for Azure", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
# Execute command
|
||||
if args.command == 'upload':
|
||||
if args.command == "upload":
|
||||
upload_command(args)
|
||||
elif args.command == 'download':
|
||||
elif args.command == "download":
|
||||
download_command(args)
|
||||
elif args.command == 'list':
|
||||
elif args.command == "list":
|
||||
list_command(args)
|
||||
elif args.command == 'delete':
|
||||
elif args.command == "delete":
|
||||
delete_command(args)
|
||||
elif args.command == 'url':
|
||||
elif args.command == "url":
|
||||
url_command(args)
|
||||
elif args.command == 'copy':
|
||||
elif args.command == "copy":
|
||||
copy_command(args)
|
||||
|
||||
except FileNotFoundError as e:
|
||||
@@ -342,9 +271,10 @@ Provider-specific options:
|
||||
print(f"❌ Error: {e}", file=sys.stderr)
|
||||
if args.verbose:
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -376,8 +376,8 @@ class CodeAnalyzer:
|
||||
for match in re.finditer(pattern, content):
|
||||
module = match.group(1)
|
||||
# Extract package name (before first /)
|
||||
package = module.split('/')[0]
|
||||
if package and not package.startswith('.'): # Skip relative imports
|
||||
package = module.split("/")[0]
|
||||
if package and not package.startswith("."): # Skip relative imports
|
||||
imports.append(package)
|
||||
|
||||
return {
|
||||
@@ -694,11 +694,11 @@ class CodeAnalyzer:
|
||||
for match in re.finditer(using_pattern, content):
|
||||
namespace = match.group(1).strip()
|
||||
# Skip using aliases (using Foo = Bar.Baz)
|
||||
if '=' not in namespace:
|
||||
if "=" not in namespace:
|
||||
# Extract base namespace (first 1-2 segments)
|
||||
parts = namespace.split('.')
|
||||
parts = namespace.split(".")
|
||||
if len(parts) >= 2:
|
||||
base_ns = '.'.join(parts[:2])
|
||||
base_ns = ".".join(parts[:2])
|
||||
imports.append(base_ns)
|
||||
elif len(parts) == 1:
|
||||
imports.append(parts[0])
|
||||
@@ -1130,10 +1130,10 @@ class CodeAnalyzer:
|
||||
for match in re.finditer(import_pattern, content):
|
||||
import_path = match.group(1).strip()
|
||||
# Extract package name (first 2-3 segments for framework detection)
|
||||
parts = import_path.split('.')
|
||||
parts = import_path.split(".")
|
||||
if len(parts) >= 2:
|
||||
# Get base package (e.g., "org.springframework" from "org.springframework.boot.SpringApplication")
|
||||
package = '.'.join(parts[:2])
|
||||
package = ".".join(parts[:2])
|
||||
imports.append(package)
|
||||
|
||||
return {
|
||||
@@ -1303,7 +1303,7 @@ class CodeAnalyzer:
|
||||
for match in re.finditer(require_pattern, content):
|
||||
module = match.group(1)
|
||||
# Extract gem name (before first /)
|
||||
gem = module.split('/')[0]
|
||||
gem = module.split("/")[0]
|
||||
imports.append(gem)
|
||||
|
||||
return {
|
||||
@@ -1443,7 +1443,7 @@ class CodeAnalyzer:
|
||||
for match in re.finditer(use_pattern, content):
|
||||
namespace = match.group(1).strip()
|
||||
# Extract vendor name (first segment)
|
||||
parts = namespace.split('\\')
|
||||
parts = namespace.split("\\")
|
||||
if parts:
|
||||
vendor = parts[0]
|
||||
imports.append(vendor.lower())
|
||||
|
||||
@@ -1036,11 +1036,15 @@ def analyze_codebase(
|
||||
# Save summary statistics
|
||||
summary_json = pattern_output / "summary.json"
|
||||
with open(summary_json, "w", encoding="utf-8") as f:
|
||||
json.dump({
|
||||
"statistics": stats,
|
||||
"thresholds": multi_level["thresholds"],
|
||||
"files_analyzed": len(pattern_results),
|
||||
}, f, indent=2)
|
||||
json.dump(
|
||||
{
|
||||
"statistics": stats,
|
||||
"thresholds": multi_level["thresholds"],
|
||||
"files_analyzed": len(pattern_results),
|
||||
},
|
||||
f,
|
||||
indent=2,
|
||||
)
|
||||
|
||||
# Log results with breakdown by confidence
|
||||
logger.info(f"✅ Detected {stats['total']} patterns in {len(pattern_results)} files")
|
||||
@@ -1931,21 +1935,15 @@ def _check_deprecated_flags(args):
|
||||
"⚠️ DEPRECATED: --ai-mode local → use --enhance-level without API key instead"
|
||||
)
|
||||
elif args.ai_mode == "none":
|
||||
warnings.append(
|
||||
"⚠️ DEPRECATED: --ai-mode none → use --enhance-level 0 instead"
|
||||
)
|
||||
warnings.append("⚠️ DEPRECATED: --ai-mode none → use --enhance-level 0 instead")
|
||||
|
||||
# Deprecated: --quick flag
|
||||
if hasattr(args, "quick") and args.quick:
|
||||
warnings.append(
|
||||
"⚠️ DEPRECATED: --quick → use --preset quick instead"
|
||||
)
|
||||
warnings.append("⚠️ DEPRECATED: --quick → use --preset quick instead")
|
||||
|
||||
# Deprecated: --comprehensive flag
|
||||
if hasattr(args, "comprehensive") and args.comprehensive:
|
||||
warnings.append(
|
||||
"⚠️ DEPRECATED: --comprehensive → use --preset comprehensive instead"
|
||||
)
|
||||
warnings.append("⚠️ DEPRECATED: --comprehensive → use --preset comprehensive instead")
|
||||
|
||||
# Show warnings if any found
|
||||
if warnings:
|
||||
@@ -2000,24 +1998,22 @@ Examples:
|
||||
parser.add_argument(
|
||||
"--preset",
|
||||
choices=["quick", "standard", "comprehensive"],
|
||||
help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)"
|
||||
help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--preset-list",
|
||||
action="store_true",
|
||||
help="Show available presets and exit"
|
||||
"--preset-list", action="store_true", help="Show available presets and exit"
|
||||
)
|
||||
|
||||
# Legacy preset flags (kept for backward compatibility)
|
||||
parser.add_argument(
|
||||
"--quick",
|
||||
action="store_true",
|
||||
help="[DEPRECATED] Quick analysis - use '--preset quick' instead"
|
||||
help="[DEPRECATED] Quick analysis - use '--preset quick' instead",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--comprehensive",
|
||||
action="store_true",
|
||||
help="[DEPRECATED] Comprehensive analysis - use '--preset comprehensive' instead"
|
||||
help="[DEPRECATED] Comprehensive analysis - use '--preset comprehensive' instead",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
@@ -2129,6 +2125,7 @@ Examples:
|
||||
# Handle --preset-list flag BEFORE parse_args() to avoid required --directory validation
|
||||
if "--preset-list" in sys.argv:
|
||||
from skill_seekers.cli.presets import PresetManager
|
||||
|
||||
print(PresetManager.format_preset_help())
|
||||
return 0
|
||||
|
||||
@@ -2155,6 +2152,7 @@ Examples:
|
||||
# Apply preset using PresetManager
|
||||
if preset_name:
|
||||
from skill_seekers.cli.presets import PresetManager
|
||||
|
||||
try:
|
||||
preset_args = PresetManager.apply_preset(preset_name, vars(args))
|
||||
# Update args with preset values
|
||||
@@ -2162,9 +2160,7 @@ Examples:
|
||||
setattr(args, key, value)
|
||||
|
||||
preset = PresetManager.get_preset(preset_name)
|
||||
logger.info(
|
||||
f"{preset.icon} {preset.name} analysis mode: {preset.description}"
|
||||
)
|
||||
logger.info(f"{preset.icon} {preset.name} analysis mode: {preset.description}")
|
||||
except ValueError as e:
|
||||
logger.error(f"❌ {e}")
|
||||
return 1
|
||||
|
||||
@@ -19,6 +19,7 @@ import numpy as np
|
||||
@dataclass
|
||||
class EmbeddingConfig:
|
||||
"""Configuration for embedding generation."""
|
||||
|
||||
provider: str # 'openai', 'cohere', 'huggingface', 'local'
|
||||
model: str
|
||||
dimension: int
|
||||
@@ -31,6 +32,7 @@ class EmbeddingConfig:
|
||||
@dataclass
|
||||
class EmbeddingResult:
|
||||
"""Result of embedding generation."""
|
||||
|
||||
embeddings: list[list[float]]
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
cached_count: int = 0
|
||||
@@ -42,6 +44,7 @@ class EmbeddingResult:
|
||||
@dataclass
|
||||
class CostTracker:
|
||||
"""Track embedding generation costs."""
|
||||
|
||||
total_tokens: int = 0
|
||||
total_requests: int = 0
|
||||
cache_hits: int = 0
|
||||
@@ -64,12 +67,12 @@ class CostTracker:
|
||||
cache_rate = (self.cache_hits / self.total_requests * 100) if self.total_requests > 0 else 0
|
||||
|
||||
return {
|
||||
'total_requests': self.total_requests,
|
||||
'total_tokens': self.total_tokens,
|
||||
'cache_hits': self.cache_hits,
|
||||
'cache_misses': self.cache_misses,
|
||||
'cache_rate': f"{cache_rate:.1f}%",
|
||||
'estimated_cost': f"${self.estimated_cost:.4f}"
|
||||
"total_requests": self.total_requests,
|
||||
"total_tokens": self.total_tokens,
|
||||
"cache_hits": self.cache_hits,
|
||||
"cache_misses": self.cache_misses,
|
||||
"cache_rate": f"{cache_rate:.1f}%",
|
||||
"estimated_cost": f"${self.estimated_cost:.4f}",
|
||||
}
|
||||
|
||||
|
||||
@@ -97,18 +100,18 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
|
||||
|
||||
# Pricing per 1M tokens (as of 2026)
|
||||
PRICING = {
|
||||
'text-embedding-ada-002': 0.10,
|
||||
'text-embedding-3-small': 0.02,
|
||||
'text-embedding-3-large': 0.13,
|
||||
"text-embedding-ada-002": 0.10,
|
||||
"text-embedding-3-small": 0.02,
|
||||
"text-embedding-3-large": 0.13,
|
||||
}
|
||||
|
||||
DIMENSIONS = {
|
||||
'text-embedding-ada-002': 1536,
|
||||
'text-embedding-3-small': 1536,
|
||||
'text-embedding-3-large': 3072,
|
||||
"text-embedding-ada-002": 1536,
|
||||
"text-embedding-3-small": 1536,
|
||||
"text-embedding-3-large": 3072,
|
||||
}
|
||||
|
||||
def __init__(self, model: str = 'text-embedding-ada-002', api_key: str | None = None):
|
||||
def __init__(self, model: str = "text-embedding-ada-002", api_key: str | None = None):
|
||||
"""Initialize OpenAI provider."""
|
||||
self.model = model
|
||||
self.api_key = api_key
|
||||
@@ -119,9 +122,12 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
|
||||
if self._client is None:
|
||||
try:
|
||||
from openai import OpenAI
|
||||
|
||||
self._client = OpenAI(api_key=self.api_key)
|
||||
except ImportError:
|
||||
raise ImportError("OpenAI package not installed. Install with: pip install openai") from None
|
||||
raise ImportError(
|
||||
"OpenAI package not installed. Install with: pip install openai"
|
||||
) from None
|
||||
return self._client
|
||||
|
||||
def generate_embeddings(self, texts: list[str]) -> list[list[float]]:
|
||||
@@ -130,10 +136,7 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
|
||||
|
||||
embeddings = []
|
||||
for text in texts:
|
||||
response = client.embeddings.create(
|
||||
model=self.model,
|
||||
input=text
|
||||
)
|
||||
response = client.embeddings.create(model=self.model, input=text)
|
||||
embeddings.append(response.data[0].embedding)
|
||||
|
||||
return embeddings
|
||||
@@ -207,7 +210,7 @@ class EmbeddingCache:
|
||||
if cache_file.exists():
|
||||
try:
|
||||
data = json.loads(cache_file.read_text())
|
||||
embedding = data['embedding']
|
||||
embedding = data["embedding"]
|
||||
self._memory_cache[cache_key] = embedding
|
||||
return embedding
|
||||
except Exception:
|
||||
@@ -226,12 +229,16 @@ class EmbeddingCache:
|
||||
if self.cache_dir:
|
||||
cache_file = self.cache_dir / f"{cache_key}.json"
|
||||
try:
|
||||
cache_file.write_text(json.dumps({
|
||||
'text_hash': cache_key,
|
||||
'model': model,
|
||||
'embedding': embedding,
|
||||
'timestamp': time.time()
|
||||
}))
|
||||
cache_file.write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"text_hash": cache_key,
|
||||
"model": model,
|
||||
"embedding": embedding,
|
||||
"timestamp": time.time(),
|
||||
}
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"⚠️ Warning: Failed to write cache: {e}")
|
||||
|
||||
@@ -252,9 +259,9 @@ class EmbeddingPipeline:
|
||||
|
||||
def _create_provider(self) -> EmbeddingProvider:
|
||||
"""Create provider based on config."""
|
||||
if self.config.provider == 'openai':
|
||||
if self.config.provider == "openai":
|
||||
return OpenAIEmbeddingProvider(self.config.model)
|
||||
elif self.config.provider == 'local':
|
||||
elif self.config.provider == "local":
|
||||
return LocalEmbeddingProvider(self.config.dimension)
|
||||
else:
|
||||
raise ValueError(f"Unknown provider: {self.config.provider}")
|
||||
@@ -264,11 +271,7 @@ class EmbeddingPipeline:
|
||||
# Rough estimate: 1 token ≈ 4 characters
|
||||
return len(text) // 4
|
||||
|
||||
def generate_batch(
|
||||
self,
|
||||
texts: list[str],
|
||||
show_progress: bool = True
|
||||
) -> EmbeddingResult:
|
||||
def generate_batch(self, texts: list[str], show_progress: bool = True) -> EmbeddingResult:
|
||||
"""
|
||||
Generate embeddings for batch of texts.
|
||||
|
||||
@@ -293,7 +296,7 @@ class EmbeddingPipeline:
|
||||
|
||||
# Process in batches
|
||||
for i in range(0, len(texts), self.config.batch_size):
|
||||
batch = texts[i:i + self.config.batch_size]
|
||||
batch = texts[i : i + self.config.batch_size]
|
||||
batch_embeddings = []
|
||||
to_generate = []
|
||||
to_generate_indices = []
|
||||
@@ -331,7 +334,7 @@ class EmbeddingPipeline:
|
||||
|
||||
if show_progress and len(texts) > self.config.batch_size:
|
||||
progress = min(i + self.config.batch_size, len(texts))
|
||||
print(f" Progress: {progress}/{len(texts)} ({progress/len(texts)*100:.1f}%)")
|
||||
print(f" Progress: {progress}/{len(texts)} ({progress / len(texts) * 100:.1f}%)")
|
||||
|
||||
total_time = time.time() - start_time
|
||||
|
||||
@@ -342,21 +345,21 @@ class EmbeddingPipeline:
|
||||
print(f" Generated: {generated_count}")
|
||||
print(f" Time: {total_time:.2f}s")
|
||||
|
||||
if self.config.provider != 'local':
|
||||
if self.config.provider != "local":
|
||||
stats = self.cost_tracker.get_stats()
|
||||
print(f" Cost: {stats['estimated_cost']}")
|
||||
|
||||
return EmbeddingResult(
|
||||
embeddings=embeddings,
|
||||
metadata={
|
||||
'provider': self.config.provider,
|
||||
'model': self.config.model,
|
||||
'dimension': self.provider.get_dimension()
|
||||
"provider": self.config.provider,
|
||||
"model": self.config.model,
|
||||
"dimension": self.provider.get_dimension(),
|
||||
},
|
||||
cached_count=cached_count,
|
||||
generated_count=generated_count,
|
||||
total_time=total_time,
|
||||
cost_estimate=self.cost_tracker.estimated_cost
|
||||
cost_estimate=self.cost_tracker.estimated_cost,
|
||||
)
|
||||
|
||||
def validate_dimensions(self, embeddings: list[list[float]]) -> bool:
|
||||
@@ -373,8 +376,10 @@ class EmbeddingPipeline:
|
||||
|
||||
for i, embedding in enumerate(embeddings):
|
||||
if len(embedding) != expected_dim:
|
||||
print(f"❌ Dimension mismatch at index {i}: "
|
||||
f"expected {expected_dim}, got {len(embedding)}")
|
||||
print(
|
||||
f"❌ Dimension mismatch at index {i}: "
|
||||
f"expected {expected_dim}, got {len(embedding)}"
|
||||
)
|
||||
return False
|
||||
|
||||
return True
|
||||
@@ -390,11 +395,11 @@ def example_usage():
|
||||
|
||||
# Configure pipeline
|
||||
config = EmbeddingConfig(
|
||||
provider='local', # Use 'openai' for production
|
||||
model='text-embedding-ada-002',
|
||||
provider="local", # Use 'openai' for production
|
||||
model="text-embedding-ada-002",
|
||||
dimension=384,
|
||||
batch_size=50,
|
||||
cache_dir=Path("output/.embeddings_cache")
|
||||
cache_dir=Path("output/.embeddings_cache"),
|
||||
)
|
||||
|
||||
# Initialize pipeline
|
||||
|
||||
@@ -175,8 +175,7 @@ class LocalSkillEnhancer:
|
||||
dangerous_chars = [";", "&", "|", "$", "`", "\n", "\r"]
|
||||
if any(char in cmd_template for char in dangerous_chars):
|
||||
raise ValueError(
|
||||
"Custom command contains dangerous shell characters. "
|
||||
f"Command: {cmd_template}"
|
||||
f"Custom command contains dangerous shell characters. Command: {cmd_template}"
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -888,9 +887,7 @@ rm {prompt_file}
|
||||
print("❌ SKILL.md not found after enhancement")
|
||||
return False
|
||||
else:
|
||||
print(
|
||||
f"❌ {self.agent_display} returned error (exit code: {result.returncode})"
|
||||
)
|
||||
print(f"❌ {self.agent_display} returned error (exit code: {result.returncode})")
|
||||
if result.stderr:
|
||||
print(f" Error: {result.stderr[:200]}")
|
||||
return False
|
||||
|
||||
@@ -16,6 +16,7 @@ from datetime import datetime
|
||||
@dataclass
|
||||
class DocumentVersion:
|
||||
"""Version information for a document."""
|
||||
|
||||
file_path: str
|
||||
content_hash: str
|
||||
size_bytes: int
|
||||
@@ -26,6 +27,7 @@ class DocumentVersion:
|
||||
@dataclass
|
||||
class ChangeSet:
|
||||
"""Set of changes detected."""
|
||||
|
||||
added: list[DocumentVersion]
|
||||
modified: list[DocumentVersion]
|
||||
deleted: list[str]
|
||||
@@ -45,6 +47,7 @@ class ChangeSet:
|
||||
@dataclass
|
||||
class UpdateMetadata:
|
||||
"""Metadata for an incremental update."""
|
||||
|
||||
timestamp: str
|
||||
previous_version: str
|
||||
new_version: str
|
||||
@@ -86,7 +89,7 @@ class IncrementalUpdater:
|
||||
sha256 = hashlib.sha256()
|
||||
|
||||
try:
|
||||
with open(file_path, 'rb') as f:
|
||||
with open(file_path, "rb") as f:
|
||||
while chunk := f.read(8192):
|
||||
sha256.update(chunk)
|
||||
return sha256.hexdigest()
|
||||
@@ -111,7 +114,7 @@ class IncrementalUpdater:
|
||||
content_hash=self._compute_file_hash(skill_md),
|
||||
size_bytes=skill_md.stat().st_size,
|
||||
last_modified=skill_md.stat().st_mtime,
|
||||
version=1
|
||||
version=1,
|
||||
)
|
||||
|
||||
# Scan references
|
||||
@@ -125,7 +128,7 @@ class IncrementalUpdater:
|
||||
content_hash=self._compute_file_hash(ref_file),
|
||||
size_bytes=ref_file.stat().st_size,
|
||||
last_modified=ref_file.stat().st_mtime,
|
||||
version=1
|
||||
version=1,
|
||||
)
|
||||
|
||||
return versions
|
||||
@@ -157,9 +160,8 @@ class IncrementalUpdater:
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"version": "1.0.0",
|
||||
"documents": {
|
||||
file_path: asdict(version)
|
||||
for file_path, version in self.current_versions.items()
|
||||
}
|
||||
file_path: asdict(version) for file_path, version in self.current_versions.items()
|
||||
},
|
||||
}
|
||||
|
||||
self.version_file.write_text(json.dumps(data, indent=2))
|
||||
@@ -180,10 +182,7 @@ class IncrementalUpdater:
|
||||
if not has_previous:
|
||||
# First time - all files are "added"
|
||||
return ChangeSet(
|
||||
added=list(self.current_versions.values()),
|
||||
modified=[],
|
||||
deleted=[],
|
||||
unchanged=[]
|
||||
added=list(self.current_versions.values()), modified=[], deleted=[], unchanged=[]
|
||||
)
|
||||
|
||||
# Detect changes
|
||||
@@ -215,18 +214,10 @@ class IncrementalUpdater:
|
||||
else:
|
||||
unchanged.append(current)
|
||||
|
||||
return ChangeSet(
|
||||
added=added,
|
||||
modified=modified,
|
||||
deleted=deleted,
|
||||
unchanged=unchanged
|
||||
)
|
||||
return ChangeSet(added=added, modified=modified, deleted=deleted, unchanged=unchanged)
|
||||
|
||||
def generate_update_package(
|
||||
self,
|
||||
change_set: ChangeSet,
|
||||
output_path: Path,
|
||||
include_content: bool = True
|
||||
self, change_set: ChangeSet, output_path: Path, include_content: bool = True
|
||||
) -> Path:
|
||||
"""
|
||||
Generate incremental update package.
|
||||
@@ -250,11 +241,11 @@ class IncrementalUpdater:
|
||||
"added": len(change_set.added),
|
||||
"modified": len(change_set.modified),
|
||||
"deleted": len(change_set.deleted),
|
||||
"unchanged": len(change_set.unchanged)
|
||||
"unchanged": len(change_set.unchanged),
|
||||
},
|
||||
"total_changes": change_set.total_changes
|
||||
"total_changes": change_set.total_changes,
|
||||
},
|
||||
"changes": {}
|
||||
"changes": {},
|
||||
}
|
||||
|
||||
# Include changed documents
|
||||
@@ -267,7 +258,7 @@ class IncrementalUpdater:
|
||||
"version": doc.version,
|
||||
"content": file_path.read_text(encoding="utf-8"),
|
||||
"hash": doc.content_hash,
|
||||
"size": doc.size_bytes
|
||||
"size": doc.size_bytes,
|
||||
}
|
||||
|
||||
# Modified documents
|
||||
@@ -278,14 +269,12 @@ class IncrementalUpdater:
|
||||
"version": doc.version,
|
||||
"content": file_path.read_text(encoding="utf-8"),
|
||||
"hash": doc.content_hash,
|
||||
"size": doc.size_bytes
|
||||
"size": doc.size_bytes,
|
||||
}
|
||||
|
||||
# Deleted documents
|
||||
for file_path in change_set.deleted:
|
||||
update_data["changes"][file_path] = {
|
||||
"action": "delete"
|
||||
}
|
||||
update_data["changes"][file_path] = {"action": "delete"}
|
||||
|
||||
# Write package
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
@@ -332,7 +321,9 @@ class IncrementalUpdater:
|
||||
if prev:
|
||||
size_diff = doc.size_bytes - prev.size_bytes
|
||||
size_str = f"{size_diff:+,} bytes" if size_diff != 0 else "same size"
|
||||
lines.append(f" ~ {doc.file_path} (v{prev.version} → v{doc.version}, {size_str})")
|
||||
lines.append(
|
||||
f" ~ {doc.file_path} (v{prev.version} → v{doc.version}, {size_str})"
|
||||
)
|
||||
else:
|
||||
lines.append(f" ~ {doc.file_path} (v{doc.version})")
|
||||
lines.append("")
|
||||
@@ -473,4 +464,5 @@ def main():
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
sys.exit(main())
|
||||
|
||||
@@ -369,8 +369,6 @@ LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = {
|
||||
(r"\$[0-9]+", 4),
|
||||
(r"->", 3),
|
||||
],
|
||||
|
||||
|
||||
# ===== Markup/Config Languages =====
|
||||
"html": [
|
||||
(r"<!DOCTYPE\s+html>", 5),
|
||||
|
||||
@@ -42,25 +42,25 @@ from skill_seekers.cli import __version__
|
||||
|
||||
# Command module mapping (command name -> module path)
|
||||
COMMAND_MODULES = {
|
||||
'config': 'skill_seekers.cli.config_command',
|
||||
'scrape': 'skill_seekers.cli.doc_scraper',
|
||||
'github': 'skill_seekers.cli.github_scraper',
|
||||
'pdf': 'skill_seekers.cli.pdf_scraper',
|
||||
'unified': 'skill_seekers.cli.unified_scraper',
|
||||
'enhance': 'skill_seekers.cli.enhance_skill_local',
|
||||
'enhance-status': 'skill_seekers.cli.enhance_status',
|
||||
'package': 'skill_seekers.cli.package_skill',
|
||||
'upload': 'skill_seekers.cli.upload_skill',
|
||||
'estimate': 'skill_seekers.cli.estimate_pages',
|
||||
'extract-test-examples': 'skill_seekers.cli.test_example_extractor',
|
||||
'install-agent': 'skill_seekers.cli.install_agent',
|
||||
'analyze': 'skill_seekers.cli.codebase_scraper',
|
||||
'install': 'skill_seekers.cli.install_skill',
|
||||
'resume': 'skill_seekers.cli.resume_command',
|
||||
'stream': 'skill_seekers.cli.streaming_ingest',
|
||||
'update': 'skill_seekers.cli.incremental_updater',
|
||||
'multilang': 'skill_seekers.cli.multilang_support',
|
||||
'quality': 'skill_seekers.cli.quality_metrics',
|
||||
"config": "skill_seekers.cli.config_command",
|
||||
"scrape": "skill_seekers.cli.doc_scraper",
|
||||
"github": "skill_seekers.cli.github_scraper",
|
||||
"pdf": "skill_seekers.cli.pdf_scraper",
|
||||
"unified": "skill_seekers.cli.unified_scraper",
|
||||
"enhance": "skill_seekers.cli.enhance_skill_local",
|
||||
"enhance-status": "skill_seekers.cli.enhance_status",
|
||||
"package": "skill_seekers.cli.package_skill",
|
||||
"upload": "skill_seekers.cli.upload_skill",
|
||||
"estimate": "skill_seekers.cli.estimate_pages",
|
||||
"extract-test-examples": "skill_seekers.cli.test_example_extractor",
|
||||
"install-agent": "skill_seekers.cli.install_agent",
|
||||
"analyze": "skill_seekers.cli.codebase_scraper",
|
||||
"install": "skill_seekers.cli.install_skill",
|
||||
"resume": "skill_seekers.cli.resume_command",
|
||||
"stream": "skill_seekers.cli.streaming_ingest",
|
||||
"update": "skill_seekers.cli.incremental_updater",
|
||||
"multilang": "skill_seekers.cli.multilang_support",
|
||||
"quality": "skill_seekers.cli.quality_metrics",
|
||||
}
|
||||
|
||||
|
||||
@@ -124,12 +124,21 @@ def _reconstruct_argv(command: str, args: argparse.Namespace) -> list[str]:
|
||||
|
||||
# Convert args to sys.argv format
|
||||
for key, value in vars(args).items():
|
||||
if key == 'command':
|
||||
if key == "command":
|
||||
continue
|
||||
|
||||
# Handle positional arguments (no -- prefix)
|
||||
if key in ['url', 'directory', 'file', 'job_id', 'skill_directory', 'zip_file', 'config', 'input_file']:
|
||||
if value is not None and value != '':
|
||||
if key in [
|
||||
"url",
|
||||
"directory",
|
||||
"file",
|
||||
"job_id",
|
||||
"skill_directory",
|
||||
"zip_file",
|
||||
"config",
|
||||
"input_file",
|
||||
]:
|
||||
if value is not None and value != "":
|
||||
argv.append(str(value))
|
||||
continue
|
||||
|
||||
@@ -172,7 +181,7 @@ def main(argv: list[str] | None = None) -> int:
|
||||
return 1
|
||||
|
||||
# Special handling for 'analyze' command (has post-processing)
|
||||
if args.command == 'analyze':
|
||||
if args.command == "analyze":
|
||||
return _handle_analyze_command(args)
|
||||
|
||||
# Standard delegation for all other commands
|
||||
@@ -200,6 +209,7 @@ def main(argv: list[str] | None = None) -> int:
|
||||
|
||||
# Show traceback in verbose mode
|
||||
import traceback
|
||||
|
||||
if hasattr(args, "verbose") and getattr(args, "verbose", False):
|
||||
traceback.print_exc()
|
||||
|
||||
@@ -226,13 +236,16 @@ def _handle_analyze_command(args: argparse.Namespace) -> int:
|
||||
|
||||
# Handle preset flags (depth and features)
|
||||
if args.quick:
|
||||
sys.argv.extend([
|
||||
"--depth", "surface",
|
||||
"--skip-patterns",
|
||||
"--skip-test-examples",
|
||||
"--skip-how-to-guides",
|
||||
"--skip-config-patterns",
|
||||
])
|
||||
sys.argv.extend(
|
||||
[
|
||||
"--depth",
|
||||
"surface",
|
||||
"--skip-patterns",
|
||||
"--skip-test-examples",
|
||||
"--skip-how-to-guides",
|
||||
"--skip-config-patterns",
|
||||
]
|
||||
)
|
||||
elif args.comprehensive:
|
||||
sys.argv.extend(["--depth", "full"])
|
||||
elif args.depth:
|
||||
@@ -246,6 +259,7 @@ def _handle_analyze_command(args: argparse.Namespace) -> int:
|
||||
elif args.enhance:
|
||||
try:
|
||||
from skill_seekers.cli.config_manager import get_config_manager
|
||||
|
||||
config = get_config_manager()
|
||||
enhance_level = config.get_default_enhance_level()
|
||||
except Exception:
|
||||
|
||||
@@ -15,6 +15,7 @@ import json
|
||||
@dataclass
|
||||
class LanguageInfo:
|
||||
"""Language information for a document."""
|
||||
|
||||
code: str # ISO 639-1 code (e.g., 'en', 'es', 'zh')
|
||||
name: str # Full name (e.g., 'English', 'Spanish', 'Chinese')
|
||||
confidence: float # Detection confidence (0.0-1.0)
|
||||
@@ -24,6 +25,7 @@ class LanguageInfo:
|
||||
@dataclass
|
||||
class TranslationStatus:
|
||||
"""Translation status for a document."""
|
||||
|
||||
source_language: str
|
||||
target_languages: list[str]
|
||||
translated_languages: set[str]
|
||||
@@ -40,74 +42,81 @@ class LanguageDetector:
|
||||
|
||||
# Common word patterns by language
|
||||
LANGUAGE_PATTERNS = {
|
||||
'en': [
|
||||
r'\b(the|and|is|are|in|to|of|for|with|on|at|by|from)\b',
|
||||
r'\b(this|that|these|those|what|which|who|where|when)\b',
|
||||
"en": [
|
||||
r"\b(the|and|is|are|in|to|of|for|with|on|at|by|from)\b",
|
||||
r"\b(this|that|these|those|what|which|who|where|when)\b",
|
||||
],
|
||||
'es': [
|
||||
r'\b(el|la|los|las|de|en|y|a|es|por|para|con|su)\b',
|
||||
r'\b(que|no|un|una|como|más|pero|muy|todo|ya)\b',
|
||||
"es": [
|
||||
r"\b(el|la|los|las|de|en|y|a|es|por|para|con|su)\b",
|
||||
r"\b(que|no|un|una|como|más|pero|muy|todo|ya)\b",
|
||||
],
|
||||
'fr': [
|
||||
r'\b(le|la|les|de|et|en|un|une|pour|dans|que|sur|avec)\b',
|
||||
r'\b(est|sont|ce|qui|plus|ne|pas|nous|vous|tout)\b',
|
||||
"fr": [
|
||||
r"\b(le|la|les|de|et|en|un|une|pour|dans|que|sur|avec)\b",
|
||||
r"\b(est|sont|ce|qui|plus|ne|pas|nous|vous|tout)\b",
|
||||
],
|
||||
'de': [
|
||||
r'\b(der|die|das|und|in|zu|den|von|ist|mit|für|auf)\b',
|
||||
r'\b(ein|eine|nicht|sich|auch|werden|an|als|ich|sie)\b',
|
||||
"de": [
|
||||
r"\b(der|die|das|und|in|zu|den|von|ist|mit|für|auf)\b",
|
||||
r"\b(ein|eine|nicht|sich|auch|werden|an|als|ich|sie)\b",
|
||||
],
|
||||
'zh': [
|
||||
r'[\u4e00-\u9fff]', # Chinese characters
|
||||
r'(的|了|和|是|在|有|我|他|不|这)',
|
||||
"zh": [
|
||||
r"[\u4e00-\u9fff]", # Chinese characters
|
||||
r"(的|了|和|是|在|有|我|他|不|这)",
|
||||
],
|
||||
'ja': [
|
||||
r'[\u3040-\u309f]', # Hiragana
|
||||
r'[\u30a0-\u30ff]', # Katakana
|
||||
r'[\u4e00-\u9faf]', # Kanji
|
||||
"ja": [
|
||||
r"[\u3040-\u309f]", # Hiragana
|
||||
r"[\u30a0-\u30ff]", # Katakana
|
||||
r"[\u4e00-\u9faf]", # Kanji
|
||||
],
|
||||
'ko': [
|
||||
r'[\uac00-\ud7af]', # Hangul
|
||||
r'(의|가|이|은|들|는|좀|잘|께|을)',
|
||||
"ko": [
|
||||
r"[\uac00-\ud7af]", # Hangul
|
||||
r"(의|가|이|은|들|는|좀|잘|께|을)",
|
||||
],
|
||||
'ru': [
|
||||
r'[\u0400-\u04ff]', # Cyrillic
|
||||
r'\b(и|в|не|на|с|что|он|по|а|как|это|все)\b',
|
||||
"ru": [
|
||||
r"[\u0400-\u04ff]", # Cyrillic
|
||||
r"\b(и|в|не|на|с|что|он|по|а|как|это|все)\b",
|
||||
],
|
||||
'pt': [
|
||||
r'\b(o|a|de|e|do|da|em|um|para|é|com|não|os|as)\b',
|
||||
r'\b(que|se|mais|por|dos|das|como|mas|uma|ou)\b',
|
||||
"pt": [
|
||||
r"\b(o|a|de|e|do|da|em|um|para|é|com|não|os|as)\b",
|
||||
r"\b(que|se|mais|por|dos|das|como|mas|uma|ou)\b",
|
||||
],
|
||||
'it': [
|
||||
r'\b(il|la|di|e|a|da|in|che|per|un|una|non|del)\b',
|
||||
r'\b(con|alla|della|al|nel|sono|come|più|ma|dei)\b',
|
||||
"it": [
|
||||
r"\b(il|la|di|e|a|da|in|che|per|un|una|non|del)\b",
|
||||
r"\b(con|alla|della|al|nel|sono|come|più|ma|dei)\b",
|
||||
],
|
||||
'ar': [
|
||||
r'[\u0600-\u06ff]', # Arabic
|
||||
r'(في|من|على|إلى|هذا|ما|أن|كان|هو|التي)',
|
||||
"ar": [
|
||||
r"[\u0600-\u06ff]", # Arabic
|
||||
r"(في|من|على|إلى|هذا|ما|أن|كان|هو|التي)",
|
||||
],
|
||||
}
|
||||
|
||||
# Language names
|
||||
LANGUAGE_NAMES = {
|
||||
'en': 'English',
|
||||
'es': 'Spanish',
|
||||
'fr': 'French',
|
||||
'de': 'German',
|
||||
'zh': 'Chinese',
|
||||
'ja': 'Japanese',
|
||||
'ko': 'Korean',
|
||||
'ru': 'Russian',
|
||||
'pt': 'Portuguese',
|
||||
'it': 'Italian',
|
||||
'ar': 'Arabic',
|
||||
"en": "English",
|
||||
"es": "Spanish",
|
||||
"fr": "French",
|
||||
"de": "German",
|
||||
"zh": "Chinese",
|
||||
"ja": "Japanese",
|
||||
"ko": "Korean",
|
||||
"ru": "Russian",
|
||||
"pt": "Portuguese",
|
||||
"it": "Italian",
|
||||
"ar": "Arabic",
|
||||
}
|
||||
|
||||
# Script types
|
||||
SCRIPTS = {
|
||||
'en': 'Latin', 'es': 'Latin', 'fr': 'Latin', 'de': 'Latin',
|
||||
'pt': 'Latin', 'it': 'Latin',
|
||||
'zh': 'Han', 'ja': 'Japanese', 'ko': 'Hangul',
|
||||
'ru': 'Cyrillic', 'ar': 'Arabic',
|
||||
"en": "Latin",
|
||||
"es": "Latin",
|
||||
"fr": "Latin",
|
||||
"de": "Latin",
|
||||
"pt": "Latin",
|
||||
"it": "Latin",
|
||||
"zh": "Han",
|
||||
"ja": "Japanese",
|
||||
"ko": "Hangul",
|
||||
"ru": "Cyrillic",
|
||||
"ar": "Arabic",
|
||||
}
|
||||
|
||||
def detect(self, text: str, sample_size: int = 2000) -> LanguageInfo:
|
||||
@@ -122,7 +131,7 @@ class LanguageDetector:
|
||||
LanguageInfo with detected language
|
||||
"""
|
||||
if not text.strip():
|
||||
return LanguageInfo('en', 'English', 0.0)
|
||||
return LanguageInfo("en", "English", 0.0)
|
||||
|
||||
# Sample text for efficiency
|
||||
sample = text[:sample_size].lower()
|
||||
@@ -140,7 +149,7 @@ class LanguageDetector:
|
||||
# Find best match
|
||||
if not scores or max(scores.values()) == 0:
|
||||
# Default to English
|
||||
return LanguageInfo('en', 'English', 0.1)
|
||||
return LanguageInfo("en", "English", 0.1)
|
||||
|
||||
best_lang = max(scores, key=scores.get)
|
||||
total_score = sum(scores.values())
|
||||
@@ -150,7 +159,7 @@ class LanguageDetector:
|
||||
code=best_lang,
|
||||
name=self.LANGUAGE_NAMES.get(best_lang, best_lang.upper()),
|
||||
confidence=min(confidence, 1.0),
|
||||
script=self.SCRIPTS.get(best_lang)
|
||||
script=self.SCRIPTS.get(best_lang),
|
||||
)
|
||||
|
||||
def detect_from_filename(self, filename: str) -> str | None:
|
||||
@@ -170,12 +179,12 @@ class LanguageDetector:
|
||||
ISO 639-1 language code or None
|
||||
"""
|
||||
# Pattern: file.en.md
|
||||
match = re.search(r'\.([a-z]{2})\.md$', filename)
|
||||
match = re.search(r"\.([a-z]{2})\.md$", filename)
|
||||
if match and match.group(1) in self.LANGUAGE_NAMES:
|
||||
return match.group(1)
|
||||
|
||||
# Pattern: file_en.md or file-en.md
|
||||
match = re.search(r'[_-]([a-z]{2})\.md$', filename)
|
||||
match = re.search(r"[_-]([a-z]{2})\.md$", filename)
|
||||
if match and match.group(1) in self.LANGUAGE_NAMES:
|
||||
return match.group(1)
|
||||
|
||||
@@ -200,7 +209,7 @@ class MultiLanguageManager:
|
||||
file_path: str,
|
||||
content: str,
|
||||
metadata: dict | None = None,
|
||||
force_language: str | None = None
|
||||
force_language: str | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Add document with language detection.
|
||||
@@ -218,7 +227,7 @@ class MultiLanguageManager:
|
||||
code=lang_code,
|
||||
name=self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
|
||||
confidence=1.0,
|
||||
script=self.detector.SCRIPTS.get(lang_code)
|
||||
script=self.detector.SCRIPTS.get(lang_code),
|
||||
)
|
||||
else:
|
||||
# Try filename pattern first
|
||||
@@ -229,7 +238,7 @@ class MultiLanguageManager:
|
||||
code=lang_code,
|
||||
name=self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
|
||||
confidence=0.95,
|
||||
script=self.detector.SCRIPTS.get(lang_code)
|
||||
script=self.detector.SCRIPTS.get(lang_code),
|
||||
)
|
||||
else:
|
||||
# Detect from content
|
||||
@@ -245,13 +254,13 @@ class MultiLanguageManager:
|
||||
self.documents[lang_code] = []
|
||||
|
||||
doc = {
|
||||
'file_path': file_path,
|
||||
'content': content,
|
||||
'language': lang_info.code,
|
||||
'language_name': lang_info.name,
|
||||
'confidence': lang_info.confidence,
|
||||
'script': lang_info.script,
|
||||
'metadata': metadata or {}
|
||||
"file_path": file_path,
|
||||
"content": content,
|
||||
"language": lang_info.code,
|
||||
"language_name": lang_info.name,
|
||||
"confidence": lang_info.confidence,
|
||||
"script": lang_info.script,
|
||||
"metadata": metadata or {},
|
||||
}
|
||||
|
||||
self.documents[lang_code].append(doc)
|
||||
@@ -284,7 +293,7 @@ class MultiLanguageManager:
|
||||
Returns:
|
||||
Translation status summary
|
||||
"""
|
||||
base_lang = base_language or self.primary_language or 'en'
|
||||
base_lang = base_language or self.primary_language or "en"
|
||||
|
||||
all_languages = set(self.documents.keys())
|
||||
base_count = self.get_document_count(base_lang)
|
||||
@@ -295,7 +304,7 @@ class MultiLanguageManager:
|
||||
target_languages=[],
|
||||
translated_languages=set(),
|
||||
missing_languages=set(),
|
||||
completeness=0.0
|
||||
completeness=0.0,
|
||||
)
|
||||
|
||||
# Check which languages have translations
|
||||
@@ -305,7 +314,7 @@ class MultiLanguageManager:
|
||||
translated.add(lang)
|
||||
|
||||
# Commonly expected languages for completeness
|
||||
expected_languages = {'en', 'es', 'fr', 'de', 'zh', 'ja'}
|
||||
expected_languages = {"en", "es", "fr", "de", "zh", "ja"}
|
||||
missing = expected_languages - all_languages
|
||||
|
||||
completeness = len(all_languages) / len(expected_languages)
|
||||
@@ -315,7 +324,7 @@ class MultiLanguageManager:
|
||||
target_languages=list(all_languages - {base_lang}),
|
||||
translated_languages=translated,
|
||||
missing_languages=missing,
|
||||
completeness=min(completeness, 1.0)
|
||||
completeness=min(completeness, 1.0),
|
||||
)
|
||||
|
||||
def export_by_language(self, output_dir: Path) -> dict[str, Path]:
|
||||
@@ -337,10 +346,10 @@ class MultiLanguageManager:
|
||||
lang_file = output_dir / f"documents_{lang_code}.json"
|
||||
|
||||
export_data = {
|
||||
'language': lang_code,
|
||||
'language_name': self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
|
||||
'document_count': len(docs),
|
||||
'documents': docs
|
||||
"language": lang_code,
|
||||
"language_name": self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
|
||||
"document_count": len(docs),
|
||||
"documents": docs,
|
||||
}
|
||||
|
||||
lang_file.write_text(json.dumps(export_data, indent=2, ensure_ascii=False))
|
||||
@@ -419,9 +428,7 @@ def main():
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
if skill_md.exists():
|
||||
manager.add_document(
|
||||
"SKILL.md",
|
||||
skill_md.read_text(encoding="utf-8"),
|
||||
{"category": "overview"}
|
||||
"SKILL.md", skill_md.read_text(encoding="utf-8"), {"category": "overview"}
|
||||
)
|
||||
|
||||
# Load reference files
|
||||
@@ -429,9 +436,7 @@ def main():
|
||||
if refs_dir.exists():
|
||||
for ref_file in refs_dir.glob("*.md"):
|
||||
manager.add_document(
|
||||
ref_file.name,
|
||||
ref_file.read_text(encoding="utf-8"),
|
||||
{"category": ref_file.stem}
|
||||
ref_file.name, ref_file.read_text(encoding="utf-8"), {"category": ref_file.stem}
|
||||
)
|
||||
|
||||
# Detect languages
|
||||
@@ -460,4 +465,5 @@ def main():
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
sys.exit(main())
|
||||
|
||||
@@ -113,7 +113,15 @@ def package_skill(
|
||||
output_dir = skill_path.parent
|
||||
|
||||
# Auto-enable chunking for RAG platforms
|
||||
RAG_PLATFORMS = ['langchain', 'llama-index', 'haystack', 'weaviate', 'chroma', 'faiss', 'qdrant']
|
||||
RAG_PLATFORMS = [
|
||||
"langchain",
|
||||
"llama-index",
|
||||
"haystack",
|
||||
"weaviate",
|
||||
"chroma",
|
||||
"faiss",
|
||||
"qdrant",
|
||||
]
|
||||
|
||||
if target in RAG_PLATFORMS and not enable_chunking:
|
||||
print(f"ℹ️ Auto-enabling chunking for {target} platform")
|
||||
@@ -126,17 +134,19 @@ def package_skill(
|
||||
if streaming:
|
||||
print(f" Mode: Streaming (chunk_size={chunk_size}, overlap={chunk_overlap})")
|
||||
elif enable_chunking:
|
||||
print(f" Chunking: Enabled (max_tokens={chunk_max_tokens}, preserve_code={preserve_code_blocks})")
|
||||
print(
|
||||
f" Chunking: Enabled (max_tokens={chunk_max_tokens}, preserve_code={preserve_code_blocks})"
|
||||
)
|
||||
|
||||
try:
|
||||
# Use streaming if requested and supported
|
||||
if streaming and hasattr(adaptor, 'package_streaming'):
|
||||
if streaming and hasattr(adaptor, "package_streaming"):
|
||||
package_path = adaptor.package_streaming(
|
||||
skill_path,
|
||||
output_dir,
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
batch_size=batch_size
|
||||
batch_size=batch_size,
|
||||
)
|
||||
elif streaming:
|
||||
print("⚠️ Streaming not supported for this platform, using standard packaging")
|
||||
@@ -145,7 +155,7 @@ def package_skill(
|
||||
output_dir,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
else:
|
||||
package_path = adaptor.package(
|
||||
@@ -153,7 +163,7 @@ def package_skill(
|
||||
output_dir,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
print(f" Output: {package_path}")
|
||||
@@ -212,7 +222,19 @@ Examples:
|
||||
|
||||
parser.add_argument(
|
||||
"--target",
|
||||
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "haystack", "weaviate", "chroma", "faiss", "qdrant"],
|
||||
choices=[
|
||||
"claude",
|
||||
"gemini",
|
||||
"openai",
|
||||
"markdown",
|
||||
"langchain",
|
||||
"llama-index",
|
||||
"haystack",
|
||||
"weaviate",
|
||||
"chroma",
|
||||
"faiss",
|
||||
"qdrant",
|
||||
],
|
||||
default="claude",
|
||||
help="Target LLM platform (default: claude)",
|
||||
)
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
This module registers all subcommand parsers and provides a factory
|
||||
function to create them.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
# Import all parser classes
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Analyze subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
@@ -21,26 +22,26 @@ class AnalyzeParser(SubcommandParser):
|
||||
"""Add analyze-specific arguments."""
|
||||
parser.add_argument("--directory", required=True, help="Directory to analyze")
|
||||
parser.add_argument(
|
||||
"--output", default="output/codebase/", help="Output directory (default: output/codebase/)"
|
||||
"--output",
|
||||
default="output/codebase/",
|
||||
help="Output directory (default: output/codebase/)",
|
||||
)
|
||||
|
||||
# Preset selection (NEW - recommended way)
|
||||
parser.add_argument(
|
||||
"--preset",
|
||||
choices=["quick", "standard", "comprehensive"],
|
||||
help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)"
|
||||
help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--preset-list",
|
||||
action="store_true",
|
||||
help="Show available presets and exit"
|
||||
"--preset-list", action="store_true", help="Show available presets and exit"
|
||||
)
|
||||
|
||||
# Legacy preset flags (kept for backward compatibility)
|
||||
parser.add_argument(
|
||||
"--quick",
|
||||
action="store_true",
|
||||
help="[DEPRECATED] Quick analysis - use '--preset quick' instead"
|
||||
help="[DEPRECATED] Quick analysis - use '--preset quick' instead",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--comprehensive",
|
||||
@@ -71,15 +72,9 @@ class AnalyzeParser(SubcommandParser):
|
||||
help="AI enhancement level: 0=off, 1=SKILL.md only (default), 2=+Architecture+Config, 3=full",
|
||||
)
|
||||
parser.add_argument("--skip-api-reference", action="store_true", help="Skip API docs")
|
||||
parser.add_argument(
|
||||
"--skip-dependency-graph", action="store_true", help="Skip dep graph"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-patterns", action="store_true", help="Skip pattern detection"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-test-examples", action="store_true", help="Skip test examples"
|
||||
)
|
||||
parser.add_argument("--skip-dependency-graph", action="store_true", help="Skip dep graph")
|
||||
parser.add_argument("--skip-patterns", action="store_true", help="Skip pattern detection")
|
||||
parser.add_argument("--skip-test-examples", action="store_true", help="Skip test examples")
|
||||
parser.add_argument("--skip-how-to-guides", action="store_true", help="Skip guides")
|
||||
parser.add_argument("--skip-config-patterns", action="store_true", help="Skip config")
|
||||
parser.add_argument(
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Base parser class for subcommands."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
import argparse
|
||||
|
||||
@@ -48,10 +49,6 @@ class SubcommandParser(ABC):
|
||||
Returns:
|
||||
Configured ArgumentParser for this subcommand
|
||||
"""
|
||||
parser = subparsers.add_parser(
|
||||
self.name,
|
||||
help=self.help,
|
||||
description=self.description
|
||||
)
|
||||
parser = subparsers.add_parser(self.name, help=self.help, description=self.description)
|
||||
self.add_arguments(parser)
|
||||
return parser
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Config subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
@@ -22,9 +23,7 @@ class ConfigParser(SubcommandParser):
|
||||
parser.add_argument(
|
||||
"--github", action="store_true", help="Go directly to GitHub token setup"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--api-keys", action="store_true", help="Go directly to API keys setup"
|
||||
)
|
||||
parser.add_argument("--api-keys", action="store_true", help="Go directly to API keys setup")
|
||||
parser.add_argument(
|
||||
"--show", action="store_true", help="Show current configuration and exit"
|
||||
)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Enhance subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Enhance-status subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
@@ -20,10 +21,6 @@ class EnhanceStatusParser(SubcommandParser):
|
||||
def add_arguments(self, parser):
|
||||
"""Add enhance-status-specific arguments."""
|
||||
parser.add_argument("skill_directory", help="Skill directory path")
|
||||
parser.add_argument(
|
||||
"--watch", "-w", action="store_true", help="Watch in real-time"
|
||||
)
|
||||
parser.add_argument("--watch", "-w", action="store_true", help="Watch in real-time")
|
||||
parser.add_argument("--json", action="store_true", help="JSON output")
|
||||
parser.add_argument(
|
||||
"--interval", type=int, default=2, help="Watch interval in seconds"
|
||||
)
|
||||
parser.add_argument("--interval", type=int, default=2, help="Watch interval in seconds")
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Estimate subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""GitHub subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
@@ -24,9 +25,7 @@ class GitHubParser(SubcommandParser):
|
||||
parser.add_argument("--name", help="Skill name")
|
||||
parser.add_argument("--description", help="Skill description")
|
||||
parser.add_argument("--enhance", action="store_true", help="AI enhancement (API)")
|
||||
parser.add_argument(
|
||||
"--enhance-local", action="store_true", help="AI enhancement (local)"
|
||||
)
|
||||
parser.add_argument("--enhance-local", action="store_true", help="AI enhancement (local)")
|
||||
parser.add_argument("--api-key", type=str, help="Anthropic API key for --enhance")
|
||||
parser.add_argument(
|
||||
"--non-interactive",
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Install-agent subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
@@ -19,9 +20,7 @@ class InstallAgentParser(SubcommandParser):
|
||||
|
||||
def add_arguments(self, parser):
|
||||
"""Add install-agent-specific arguments."""
|
||||
parser.add_argument(
|
||||
"skill_directory", help="Skill directory path (e.g., output/react/)"
|
||||
)
|
||||
parser.add_argument("skill_directory", help="Skill directory path (e.g., output/react/)")
|
||||
parser.add_argument(
|
||||
"--agent",
|
||||
required=True,
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Install subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Multilang subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Package subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
@@ -20,27 +21,72 @@ class PackageParser(SubcommandParser):
|
||||
def add_arguments(self, parser):
|
||||
"""Add package-specific arguments."""
|
||||
parser.add_argument("skill_directory", help="Skill directory path (e.g., output/react/)")
|
||||
parser.add_argument("--no-open", action="store_true", help="Don't open output folder after packaging")
|
||||
parser.add_argument("--skip-quality-check", action="store_true", help="Skip quality checks before packaging")
|
||||
parser.add_argument(
|
||||
"--no-open", action="store_true", help="Don't open output folder after packaging"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-quality-check", action="store_true", help="Skip quality checks before packaging"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--target",
|
||||
choices=[
|
||||
"claude", "gemini", "openai", "markdown",
|
||||
"langchain", "llama-index", "haystack",
|
||||
"weaviate", "chroma", "faiss", "qdrant"
|
||||
"claude",
|
||||
"gemini",
|
||||
"openai",
|
||||
"markdown",
|
||||
"langchain",
|
||||
"llama-index",
|
||||
"haystack",
|
||||
"weaviate",
|
||||
"chroma",
|
||||
"faiss",
|
||||
"qdrant",
|
||||
],
|
||||
default="claude",
|
||||
help="Target LLM platform (default: claude)",
|
||||
)
|
||||
parser.add_argument("--upload", action="store_true", help="Automatically upload after packaging (requires platform API key)")
|
||||
parser.add_argument(
|
||||
"--upload",
|
||||
action="store_true",
|
||||
help="Automatically upload after packaging (requires platform API key)",
|
||||
)
|
||||
|
||||
# Streaming options
|
||||
parser.add_argument("--streaming", action="store_true", help="Use streaming ingestion for large docs (memory-efficient)")
|
||||
parser.add_argument("--chunk-size", type=int, default=4000, help="Maximum characters per chunk (streaming mode, default: 4000)")
|
||||
parser.add_argument("--chunk-overlap", type=int, default=200, help="Overlap between chunks (streaming mode, default: 200)")
|
||||
parser.add_argument("--batch-size", type=int, default=100, help="Number of chunks per batch (streaming mode, default: 100)")
|
||||
parser.add_argument(
|
||||
"--streaming",
|
||||
action="store_true",
|
||||
help="Use streaming ingestion for large docs (memory-efficient)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chunk-size",
|
||||
type=int,
|
||||
default=4000,
|
||||
help="Maximum characters per chunk (streaming mode, default: 4000)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chunk-overlap",
|
||||
type=int,
|
||||
default=200,
|
||||
help="Overlap between chunks (streaming mode, default: 200)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Number of chunks per batch (streaming mode, default: 100)",
|
||||
)
|
||||
|
||||
# RAG chunking options
|
||||
parser.add_argument("--chunk", action="store_true", help="Enable intelligent chunking for RAG platforms (auto-enabled for RAG adaptors)")
|
||||
parser.add_argument("--chunk-tokens", type=int, default=512, help="Maximum tokens per chunk (default: 512)")
|
||||
parser.add_argument("--no-preserve-code", action="store_true", help="Allow code block splitting (default: code blocks preserved)")
|
||||
parser.add_argument(
|
||||
"--chunk",
|
||||
action="store_true",
|
||||
help="Enable intelligent chunking for RAG platforms (auto-enabled for RAG adaptors)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chunk-tokens", type=int, default=512, help="Maximum tokens per chunk (default: 512)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-preserve-code",
|
||||
action="store_true",
|
||||
help="Allow code block splitting (default: code blocks preserved)",
|
||||
)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""PDF subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Quality subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Resume subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Scrape subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
@@ -24,15 +25,16 @@ class ScrapeParser(SubcommandParser):
|
||||
parser.add_argument("--name", help="Skill name")
|
||||
parser.add_argument("--description", help="Skill description")
|
||||
parser.add_argument(
|
||||
"--max-pages", type=int, dest="max_pages", help="Maximum pages to scrape (override config)"
|
||||
"--max-pages",
|
||||
type=int,
|
||||
dest="max_pages",
|
||||
help="Maximum pages to scrape (override config)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-scrape", action="store_true", help="Skip scraping, use cached data"
|
||||
)
|
||||
parser.add_argument("--enhance", action="store_true", help="AI enhancement (API)")
|
||||
parser.add_argument(
|
||||
"--enhance-local", action="store_true", help="AI enhancement (local)"
|
||||
)
|
||||
parser.add_argument("--enhance-local", action="store_true", help="AI enhancement (local)")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Dry run mode")
|
||||
parser.add_argument(
|
||||
"--async", dest="async_mode", action="store_true", help="Use async scraping"
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Stream subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Extract-test-examples subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
@@ -19,9 +20,7 @@ class TestExamplesParser(SubcommandParser):
|
||||
|
||||
def add_arguments(self, parser):
|
||||
"""Add extract-test-examples-specific arguments."""
|
||||
parser.add_argument(
|
||||
"directory", nargs="?", help="Directory containing test files"
|
||||
)
|
||||
parser.add_argument("directory", nargs="?", help="Directory containing test files")
|
||||
parser.add_argument("--file", help="Single test file to analyze")
|
||||
parser.add_argument(
|
||||
"--language", help="Filter by programming language (python, javascript, etc.)"
|
||||
@@ -36,6 +35,4 @@ class TestExamplesParser(SubcommandParser):
|
||||
"--max-per-file", type=int, default=10, help="Maximum examples per file (default: 10)"
|
||||
)
|
||||
parser.add_argument("--json", action="store_true", help="Output JSON format")
|
||||
parser.add_argument(
|
||||
"--markdown", action="store_true", help="Output Markdown format"
|
||||
)
|
||||
parser.add_argument("--markdown", action="store_true", help="Output Markdown format")
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Unified subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Update subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
"""Upload subcommand parser."""
|
||||
|
||||
from .base import SubcommandParser
|
||||
|
||||
|
||||
@@ -19,7 +20,9 @@ class UploadParser(SubcommandParser):
|
||||
|
||||
def add_arguments(self, parser):
|
||||
"""Add upload-specific arguments."""
|
||||
parser.add_argument("package_file", help="Path to skill package file (e.g., output/react.zip)")
|
||||
parser.add_argument(
|
||||
"package_file", help="Path to skill package file (e.g., output/react.zip)"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--target",
|
||||
@@ -33,22 +36,34 @@ class UploadParser(SubcommandParser):
|
||||
# ChromaDB upload options
|
||||
parser.add_argument(
|
||||
"--chroma-url",
|
||||
help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)"
|
||||
help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--persist-directory",
|
||||
help="Local directory for persistent ChromaDB storage (default: ./chroma_db)"
|
||||
help="Local directory for persistent ChromaDB storage (default: ./chroma_db)",
|
||||
)
|
||||
|
||||
# Embedding options
|
||||
parser.add_argument(
|
||||
"--embedding-function",
|
||||
choices=["openai", "sentence-transformers", "none"],
|
||||
help="Embedding function for ChromaDB/Weaviate (default: platform default)"
|
||||
help="Embedding function for ChromaDB/Weaviate (default: platform default)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--openai-api-key", help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)"
|
||||
)
|
||||
parser.add_argument("--openai-api-key", help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)")
|
||||
|
||||
# Weaviate upload options
|
||||
parser.add_argument("--weaviate-url", default="http://localhost:8080", help="Weaviate URL (default: http://localhost:8080)")
|
||||
parser.add_argument("--use-cloud", action="store_true", help="Use Weaviate Cloud (requires --api-key and --cluster-url)")
|
||||
parser.add_argument("--cluster-url", help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)")
|
||||
parser.add_argument(
|
||||
"--weaviate-url",
|
||||
default="http://localhost:8080",
|
||||
help="Weaviate URL (default: http://localhost:8080)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use-cloud",
|
||||
action="store_true",
|
||||
help="Use Weaviate Cloud (requires --api-key and --cluster-url)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cluster-url", help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)"
|
||||
)
|
||||
|
||||
@@ -30,14 +30,14 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
# Confidence thresholds for pattern filtering (Issue #240)
|
||||
CONFIDENCE_THRESHOLDS = {
|
||||
'critical': 0.80, # High-confidence patterns for ARCHITECTURE.md
|
||||
'high': 0.70, # Include in detailed analysis
|
||||
'medium': 0.60, # Include with warning/context
|
||||
'low': 0.50, # Minimum detection threshold
|
||||
"critical": 0.80, # High-confidence patterns for ARCHITECTURE.md
|
||||
"high": 0.70, # Include in detailed analysis
|
||||
"medium": 0.60, # Include with warning/context
|
||||
"low": 0.50, # Minimum detection threshold
|
||||
}
|
||||
|
||||
# Default minimum confidence for pattern detection
|
||||
DEFAULT_MIN_CONFIDENCE = CONFIDENCE_THRESHOLDS['low']
|
||||
DEFAULT_MIN_CONFIDENCE = CONFIDENCE_THRESHOLDS["low"]
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -1697,9 +1697,11 @@ def create_multi_level_report(pattern_results: list[dict]) -> dict:
|
||||
all_patterns_sorted = sorted(all_patterns, key=lambda p: p.get("confidence", 0.0), reverse=True)
|
||||
|
||||
# Filter by confidence levels
|
||||
critical = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS['critical'])
|
||||
high_confidence = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS['high'])
|
||||
medium = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS['medium'])
|
||||
critical = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS["critical"])
|
||||
high_confidence = filter_patterns_by_confidence(
|
||||
all_patterns_sorted, CONFIDENCE_THRESHOLDS["high"]
|
||||
)
|
||||
medium = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS["medium"])
|
||||
|
||||
return {
|
||||
"all_patterns": all_patterns_sorted,
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
Provides predefined analysis configurations with clear trade-offs
|
||||
between speed and comprehensiveness.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@@ -13,6 +14,7 @@ class AnalysisPreset:
|
||||
Defines a complete analysis configuration including depth,
|
||||
feature flags, and AI enhancement level.
|
||||
"""
|
||||
|
||||
name: str
|
||||
description: str
|
||||
depth: str # surface, deep, full
|
||||
@@ -29,54 +31,52 @@ PRESETS = {
|
||||
description="Fast basic analysis (1-2 min, essential features only)",
|
||||
depth="surface",
|
||||
features={
|
||||
"api_reference": True, # ON - Essential for API docs
|
||||
"api_reference": True, # ON - Essential for API docs
|
||||
"dependency_graph": False, # OFF - Slow, not critical for quick
|
||||
"patterns": False, # OFF - Slow pattern detection
|
||||
"test_examples": False, # OFF - Time-consuming extraction
|
||||
"how_to_guides": False, # OFF - Requires AI enhancement
|
||||
"config_patterns": False, # OFF - Not critical for quick scan
|
||||
"docs": True, # ON - README/docs are essential
|
||||
"patterns": False, # OFF - Slow pattern detection
|
||||
"test_examples": False, # OFF - Time-consuming extraction
|
||||
"how_to_guides": False, # OFF - Requires AI enhancement
|
||||
"config_patterns": False, # OFF - Not critical for quick scan
|
||||
"docs": True, # ON - README/docs are essential
|
||||
},
|
||||
enhance_level=0, # No AI enhancement (fast)
|
||||
estimated_time="1-2 minutes",
|
||||
icon="⚡"
|
||||
icon="⚡",
|
||||
),
|
||||
|
||||
"standard": AnalysisPreset(
|
||||
name="Standard",
|
||||
description="Balanced analysis (5-10 min, core features, DEFAULT)",
|
||||
depth="deep",
|
||||
features={
|
||||
"api_reference": True, # ON - Core feature
|
||||
"dependency_graph": True, # ON - Valuable insights
|
||||
"patterns": True, # ON - Design pattern detection
|
||||
"test_examples": True, # ON - Real usage examples
|
||||
"how_to_guides": False, # OFF - Requires AI (slow)
|
||||
"config_patterns": True, # ON - Configuration docs
|
||||
"docs": True, # ON - Project documentation
|
||||
"api_reference": True, # ON - Core feature
|
||||
"dependency_graph": True, # ON - Valuable insights
|
||||
"patterns": True, # ON - Design pattern detection
|
||||
"test_examples": True, # ON - Real usage examples
|
||||
"how_to_guides": False, # OFF - Requires AI (slow)
|
||||
"config_patterns": True, # ON - Configuration docs
|
||||
"docs": True, # ON - Project documentation
|
||||
},
|
||||
enhance_level=1, # SKILL.md enhancement only
|
||||
estimated_time="5-10 minutes",
|
||||
icon="🎯"
|
||||
icon="🎯",
|
||||
),
|
||||
|
||||
"comprehensive": AnalysisPreset(
|
||||
name="Comprehensive",
|
||||
description="Full analysis (20-60 min, all features + AI)",
|
||||
depth="full",
|
||||
features={
|
||||
"api_reference": True, # ON - Complete API docs
|
||||
"dependency_graph": True, # ON - Full dependency analysis
|
||||
"patterns": True, # ON - All design patterns
|
||||
"test_examples": True, # ON - All test examples
|
||||
"how_to_guides": True, # ON - AI-generated guides
|
||||
"config_patterns": True, # ON - All configuration patterns
|
||||
"docs": True, # ON - All project docs
|
||||
"api_reference": True, # ON - Complete API docs
|
||||
"dependency_graph": True, # ON - Full dependency analysis
|
||||
"patterns": True, # ON - All design patterns
|
||||
"test_examples": True, # ON - All test examples
|
||||
"how_to_guides": True, # ON - AI-generated guides
|
||||
"config_patterns": True, # ON - All configuration patterns
|
||||
"docs": True, # ON - All project docs
|
||||
},
|
||||
enhance_level=3, # Full AI enhancement (all features)
|
||||
estimated_time="20-60 minutes",
|
||||
icon="🚀"
|
||||
)
|
||||
icon="🚀",
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
@@ -142,10 +142,7 @@ class PresetManager:
|
||||
raise ValueError(f"Unknown preset: {preset_name}")
|
||||
|
||||
# Start with preset defaults
|
||||
updated_args = {
|
||||
'depth': preset.depth,
|
||||
'enhance_level': preset.enhance_level
|
||||
}
|
||||
updated_args = {"depth": preset.depth, "enhance_level": preset.enhance_level}
|
||||
|
||||
# Convert feature flags to skip_* arguments
|
||||
# feature=False → skip_feature=True (disabled)
|
||||
|
||||
@@ -16,6 +16,7 @@ from enum import Enum
|
||||
|
||||
class MetricLevel(Enum):
|
||||
"""Metric severity level."""
|
||||
|
||||
INFO = "info"
|
||||
WARNING = "warning"
|
||||
ERROR = "error"
|
||||
@@ -25,6 +26,7 @@ class MetricLevel(Enum):
|
||||
@dataclass
|
||||
class QualityMetric:
|
||||
"""Individual quality metric."""
|
||||
|
||||
name: str
|
||||
value: float # 0.0-1.0 (or 0-100 percentage)
|
||||
level: MetricLevel
|
||||
@@ -35,6 +37,7 @@ class QualityMetric:
|
||||
@dataclass
|
||||
class QualityScore:
|
||||
"""Overall quality score."""
|
||||
|
||||
total_score: float # 0-100
|
||||
completeness: float # 0-100
|
||||
accuracy: float # 0-100
|
||||
@@ -46,6 +49,7 @@ class QualityScore:
|
||||
@dataclass
|
||||
class QualityReport:
|
||||
"""Complete quality report."""
|
||||
|
||||
timestamp: str
|
||||
skill_name: str
|
||||
overall_score: QualityScore
|
||||
@@ -64,10 +68,17 @@ class QualityAnalyzer:
|
||||
|
||||
# Thresholds for quality grades
|
||||
GRADE_THRESHOLDS = {
|
||||
'A+': 95, 'A': 90, 'A-': 85,
|
||||
'B+': 80, 'B': 75, 'B-': 70,
|
||||
'C+': 65, 'C': 60, 'C-': 55,
|
||||
'D': 50, 'F': 0
|
||||
"A+": 95,
|
||||
"A": 90,
|
||||
"A-": 85,
|
||||
"B+": 80,
|
||||
"B": 75,
|
||||
"B-": 70,
|
||||
"C+": 65,
|
||||
"C": 60,
|
||||
"C-": 55,
|
||||
"D": 50,
|
||||
"F": 0,
|
||||
}
|
||||
|
||||
def __init__(self, skill_dir: Path):
|
||||
@@ -102,7 +113,7 @@ class QualityAnalyzer:
|
||||
score += 10
|
||||
|
||||
# Has sections (10 points)
|
||||
if content.count('#') >= 5:
|
||||
if content.count("#") >= 5:
|
||||
score += 10
|
||||
|
||||
# References directory (20 points)
|
||||
@@ -134,13 +145,15 @@ class QualityAnalyzer:
|
||||
if len(suggestions) == 0:
|
||||
suggestions.append("Expand documentation coverage")
|
||||
|
||||
self.metrics.append(QualityMetric(
|
||||
name="Completeness",
|
||||
value=completeness,
|
||||
level=level,
|
||||
description=f"Documentation completeness: {completeness:.1f}%",
|
||||
suggestions=suggestions
|
||||
))
|
||||
self.metrics.append(
|
||||
QualityMetric(
|
||||
name="Completeness",
|
||||
value=completeness,
|
||||
level=level,
|
||||
description=f"Documentation completeness: {completeness:.1f}%",
|
||||
suggestions=suggestions,
|
||||
)
|
||||
)
|
||||
|
||||
return completeness
|
||||
|
||||
@@ -166,14 +179,14 @@ class QualityAnalyzer:
|
||||
content = skill_md.read_text(encoding="utf-8")
|
||||
|
||||
# Check for TODO markers (deduct 5 points each, max 20)
|
||||
todo_count = content.lower().count('todo')
|
||||
todo_count = content.lower().count("todo")
|
||||
if todo_count > 0:
|
||||
deduction = min(todo_count * 5, 20)
|
||||
score -= deduction
|
||||
issues.append(f"Found {todo_count} TODO markers")
|
||||
|
||||
# Check for placeholder text (deduct 10)
|
||||
placeholders = ['lorem ipsum', 'placeholder', 'coming soon']
|
||||
placeholders = ["lorem ipsum", "placeholder", "coming soon"]
|
||||
for placeholder in placeholders:
|
||||
if placeholder in content.lower():
|
||||
score -= 10
|
||||
@@ -195,13 +208,15 @@ class QualityAnalyzer:
|
||||
if accuracy < 100 and issues:
|
||||
suggestions.extend(issues[:3]) # Top 3 issues
|
||||
|
||||
self.metrics.append(QualityMetric(
|
||||
name="Accuracy",
|
||||
value=accuracy,
|
||||
level=level,
|
||||
description=f"Documentation accuracy: {accuracy:.1f}%",
|
||||
suggestions=suggestions
|
||||
))
|
||||
self.metrics.append(
|
||||
QualityMetric(
|
||||
name="Accuracy",
|
||||
value=accuracy,
|
||||
level=level,
|
||||
description=f"Documentation accuracy: {accuracy:.1f}%",
|
||||
suggestions=suggestions,
|
||||
)
|
||||
)
|
||||
|
||||
return accuracy
|
||||
|
||||
@@ -234,13 +249,13 @@ class QualityAnalyzer:
|
||||
# Check for specific types (20 points each)
|
||||
ref_names = [f.stem.lower() for f in ref_files]
|
||||
|
||||
if any('getting' in name or 'start' in name for name in ref_names):
|
||||
if any("getting" in name or "start" in name for name in ref_names):
|
||||
score += 20
|
||||
|
||||
if any('api' in name or 'reference' in name for name in ref_names):
|
||||
if any("api" in name or "reference" in name for name in ref_names):
|
||||
score += 20
|
||||
|
||||
if any('example' in name or 'tutorial' in name for name in ref_names):
|
||||
if any("example" in name or "tutorial" in name for name in ref_names):
|
||||
score += 20
|
||||
|
||||
# Has diverse content (10 points)
|
||||
@@ -258,13 +273,15 @@ class QualityAnalyzer:
|
||||
suggestions.append("Add API reference documentation")
|
||||
suggestions.append("Expand documentation coverage")
|
||||
|
||||
self.metrics.append(QualityMetric(
|
||||
name="Coverage",
|
||||
value=coverage,
|
||||
level=level,
|
||||
description=f"Documentation coverage: {coverage:.1f}%",
|
||||
suggestions=suggestions
|
||||
))
|
||||
self.metrics.append(
|
||||
QualityMetric(
|
||||
name="Coverage",
|
||||
value=coverage,
|
||||
level=level,
|
||||
description=f"Documentation coverage: {coverage:.1f}%",
|
||||
suggestions=suggestions,
|
||||
)
|
||||
)
|
||||
|
||||
return coverage
|
||||
|
||||
@@ -308,56 +325,54 @@ class QualityAnalyzer:
|
||||
if health < 100:
|
||||
suggestions.extend(issues[:3])
|
||||
|
||||
self.metrics.append(QualityMetric(
|
||||
name="Health",
|
||||
value=health,
|
||||
level=level,
|
||||
description=f"Skill health: {health:.1f}%",
|
||||
suggestions=suggestions
|
||||
))
|
||||
self.metrics.append(
|
||||
QualityMetric(
|
||||
name="Health",
|
||||
value=health,
|
||||
level=level,
|
||||
description=f"Skill health: {health:.1f}%",
|
||||
suggestions=suggestions,
|
||||
)
|
||||
)
|
||||
|
||||
return health
|
||||
|
||||
def calculate_statistics(self) -> dict[str, Any]:
|
||||
"""Calculate skill statistics."""
|
||||
stats = {
|
||||
'total_files': 0,
|
||||
'total_size_bytes': 0,
|
||||
'markdown_files': 0,
|
||||
'reference_files': 0,
|
||||
'total_characters': 0,
|
||||
'total_words': 0
|
||||
"total_files": 0,
|
||||
"total_size_bytes": 0,
|
||||
"markdown_files": 0,
|
||||
"reference_files": 0,
|
||||
"total_characters": 0,
|
||||
"total_words": 0,
|
||||
}
|
||||
|
||||
# Count files and sizes
|
||||
for md_file in self.skill_dir.rglob("*.md"):
|
||||
stats['total_files'] += 1
|
||||
stats['markdown_files'] += 1
|
||||
stats["total_files"] += 1
|
||||
stats["markdown_files"] += 1
|
||||
size = md_file.stat().st_size
|
||||
stats['total_size_bytes'] += size
|
||||
stats["total_size_bytes"] += size
|
||||
|
||||
# Count words
|
||||
try:
|
||||
content = md_file.read_text(encoding="utf-8")
|
||||
stats['total_characters'] += len(content)
|
||||
stats['total_words'] += len(content.split())
|
||||
stats["total_characters"] += len(content)
|
||||
stats["total_words"] += len(content.split())
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Count references
|
||||
refs_dir = self.skill_dir / "references"
|
||||
if refs_dir.exists():
|
||||
stats['reference_files'] = len(list(refs_dir.glob("*.md")))
|
||||
stats["reference_files"] = len(list(refs_dir.glob("*.md")))
|
||||
|
||||
self.statistics = stats
|
||||
return stats
|
||||
|
||||
def calculate_overall_score(
|
||||
self,
|
||||
completeness: float,
|
||||
accuracy: float,
|
||||
coverage: float,
|
||||
health: float
|
||||
self, completeness: float, accuracy: float, coverage: float, health: float
|
||||
) -> QualityScore:
|
||||
"""
|
||||
Calculate overall quality score.
|
||||
@@ -368,15 +383,10 @@ class QualityAnalyzer:
|
||||
- Coverage: 25%
|
||||
- Health: 20%
|
||||
"""
|
||||
total = (
|
||||
completeness * 0.30 +
|
||||
accuracy * 0.25 +
|
||||
coverage * 0.25 +
|
||||
health * 0.20
|
||||
)
|
||||
total = completeness * 0.30 + accuracy * 0.25 + coverage * 0.25 + health * 0.20
|
||||
|
||||
# Determine grade
|
||||
grade = 'F'
|
||||
grade = "F"
|
||||
for g, threshold in self.GRADE_THRESHOLDS.items():
|
||||
if total >= threshold:
|
||||
grade = g
|
||||
@@ -388,7 +398,7 @@ class QualityAnalyzer:
|
||||
accuracy=accuracy,
|
||||
coverage=coverage,
|
||||
health=health,
|
||||
grade=grade
|
||||
grade=grade,
|
||||
)
|
||||
|
||||
def generate_recommendations(self, score: QualityScore) -> list[str]:
|
||||
@@ -431,9 +441,7 @@ class QualityAnalyzer:
|
||||
health = self.analyze_health()
|
||||
|
||||
# Calculate overall score
|
||||
overall_score = self.calculate_overall_score(
|
||||
completeness, accuracy, coverage, health
|
||||
)
|
||||
overall_score = self.calculate_overall_score(completeness, accuracy, coverage, health)
|
||||
|
||||
# Calculate statistics
|
||||
stats = self.calculate_statistics()
|
||||
@@ -447,7 +455,7 @@ class QualityAnalyzer:
|
||||
overall_score=overall_score,
|
||||
metrics=self.metrics,
|
||||
statistics=stats,
|
||||
recommendations=recommendations
|
||||
recommendations=recommendations,
|
||||
)
|
||||
|
||||
def format_report(self, report: QualityReport) -> str:
|
||||
@@ -484,7 +492,7 @@ class QualityAnalyzer:
|
||||
MetricLevel.INFO: "✅",
|
||||
MetricLevel.WARNING: "⚠️",
|
||||
MetricLevel.ERROR: "❌",
|
||||
MetricLevel.CRITICAL: "🔴"
|
||||
MetricLevel.CRITICAL: "🔴",
|
||||
}.get(metric.level, "ℹ️")
|
||||
|
||||
lines.append(f" {icon} {metric.name}: {metric.value:.1f}%")
|
||||
@@ -553,4 +561,5 @@ def main():
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
sys.exit(main())
|
||||
|
||||
@@ -75,10 +75,7 @@ class RAGChunker:
|
||||
return len(text) // self.chars_per_token
|
||||
|
||||
def chunk_document(
|
||||
self,
|
||||
text: str,
|
||||
metadata: dict,
|
||||
source_file: str | None = None
|
||||
self, text: str, metadata: dict, source_file: str | None = None
|
||||
) -> list[dict]:
|
||||
"""
|
||||
Chunk single document into RAG-ready chunks.
|
||||
@@ -125,11 +122,13 @@ class RAGChunker:
|
||||
if source_file:
|
||||
chunk_metadata["source_file"] = source_file
|
||||
|
||||
result.append({
|
||||
"chunk_id": f"{metadata.get('source', 'unknown')}_{i}",
|
||||
"page_content": chunk_text.strip(),
|
||||
"metadata": chunk_metadata
|
||||
})
|
||||
result.append(
|
||||
{
|
||||
"chunk_id": f"{metadata.get('source', 'unknown')}_{i}",
|
||||
"page_content": chunk_text.strip(),
|
||||
"metadata": chunk_metadata,
|
||||
}
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Created {len(result)} chunks from {source_file or 'document'} "
|
||||
@@ -153,14 +152,10 @@ class RAGChunker:
|
||||
# Chunk main SKILL.md
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
if skill_md.exists():
|
||||
with open(skill_md, encoding='utf-8') as f:
|
||||
with open(skill_md, encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
metadata = {
|
||||
"source": skill_dir.name,
|
||||
"category": "overview",
|
||||
"file_type": "skill_md"
|
||||
}
|
||||
metadata = {"source": skill_dir.name, "category": "overview", "file_type": "skill_md"}
|
||||
|
||||
chunks = self.chunk_document(content, metadata, source_file="SKILL.md")
|
||||
all_chunks.extend(chunks)
|
||||
@@ -169,26 +164,21 @@ class RAGChunker:
|
||||
references_dir = skill_dir / "references"
|
||||
if references_dir.exists():
|
||||
for ref_file in references_dir.glob("*.md"):
|
||||
with open(ref_file, encoding='utf-8') as f:
|
||||
with open(ref_file, encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
metadata = {
|
||||
"source": skill_dir.name,
|
||||
"category": ref_file.stem,
|
||||
"file_type": "reference"
|
||||
"file_type": "reference",
|
||||
}
|
||||
|
||||
chunks = self.chunk_document(
|
||||
content,
|
||||
metadata,
|
||||
source_file=str(ref_file.relative_to(skill_dir))
|
||||
content, metadata, source_file=str(ref_file.relative_to(skill_dir))
|
||||
)
|
||||
all_chunks.extend(chunks)
|
||||
|
||||
logger.info(
|
||||
f"Chunked skill directory {skill_dir.name}: "
|
||||
f"{len(all_chunks)} total chunks"
|
||||
)
|
||||
logger.info(f"Chunked skill directory {skill_dir.name}: {len(all_chunks)} total chunks")
|
||||
|
||||
return all_chunks
|
||||
|
||||
@@ -207,32 +197,25 @@ class RAGChunker:
|
||||
|
||||
# Match code blocks (``` fenced blocks)
|
||||
# Use DOTALL flag to match across newlines
|
||||
code_block_pattern = r'```[^\n]*\n.*?```'
|
||||
code_block_pattern = r"```[^\n]*\n.*?```"
|
||||
|
||||
def replacer(match):
|
||||
idx = len(code_blocks)
|
||||
code_blocks.append({
|
||||
"index": idx,
|
||||
"content": match.group(0),
|
||||
"start": match.start(),
|
||||
"end": match.end()
|
||||
})
|
||||
code_blocks.append(
|
||||
{
|
||||
"index": idx,
|
||||
"content": match.group(0),
|
||||
"start": match.start(),
|
||||
"end": match.end(),
|
||||
}
|
||||
)
|
||||
return placeholder_pattern.format(idx=idx)
|
||||
|
||||
text_with_placeholders = re.sub(
|
||||
code_block_pattern,
|
||||
replacer,
|
||||
text,
|
||||
flags=re.DOTALL
|
||||
)
|
||||
text_with_placeholders = re.sub(code_block_pattern, replacer, text, flags=re.DOTALL)
|
||||
|
||||
return text_with_placeholders, code_blocks
|
||||
|
||||
def _reinsert_code_blocks(
|
||||
self,
|
||||
chunks: list[str],
|
||||
code_blocks: list[dict]
|
||||
) -> list[str]:
|
||||
def _reinsert_code_blocks(self, chunks: list[str], code_blocks: list[dict]) -> list[str]:
|
||||
"""
|
||||
Re-insert code blocks into chunks.
|
||||
|
||||
@@ -249,7 +232,7 @@ class RAGChunker:
|
||||
for block in code_blocks:
|
||||
placeholder = f"<<CODE_BLOCK_{block['index']}>>"
|
||||
if placeholder in chunk:
|
||||
chunk = chunk.replace(placeholder, block['content'])
|
||||
chunk = chunk.replace(placeholder, block["content"])
|
||||
result.append(chunk)
|
||||
|
||||
return result
|
||||
@@ -268,15 +251,15 @@ class RAGChunker:
|
||||
|
||||
# Paragraph boundaries (double newline)
|
||||
if self.preserve_paragraphs:
|
||||
for match in re.finditer(r'\n\n+', text):
|
||||
for match in re.finditer(r"\n\n+", text):
|
||||
boundaries.append(match.end())
|
||||
|
||||
# Section headers (# Header)
|
||||
for match in re.finditer(r'\n#{1,6}\s+.+\n', text):
|
||||
for match in re.finditer(r"\n#{1,6}\s+.+\n", text):
|
||||
boundaries.append(match.start())
|
||||
|
||||
# Single newlines (less preferred, but useful)
|
||||
for match in re.finditer(r'\n', text):
|
||||
for match in re.finditer(r"\n", text):
|
||||
boundaries.append(match.start())
|
||||
|
||||
# Add artificial boundaries for large documents
|
||||
@@ -352,7 +335,9 @@ class RAGChunker:
|
||||
|
||||
# Add chunk if it meets minimum size requirement
|
||||
# (unless the entire text is smaller than target size)
|
||||
if chunk_text.strip() and (len(text) <= target_size_chars or len(chunk_text) >= min_size_chars):
|
||||
if chunk_text.strip() and (
|
||||
len(text) <= target_size_chars or len(chunk_text) >= min_size_chars
|
||||
):
|
||||
chunks.append(chunk_text)
|
||||
|
||||
# Move to next chunk with overlap
|
||||
@@ -383,7 +368,7 @@ class RAGChunker:
|
||||
"""
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(chunks, f, indent=2, ensure_ascii=False)
|
||||
|
||||
logger.info(f"Saved {len(chunks)} chunks to {output_path}")
|
||||
@@ -393,7 +378,9 @@ def main():
|
||||
"""CLI entry point for testing RAG chunker."""
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description="RAG Chunker - Semantic chunking for RAG pipelines")
|
||||
parser = argparse.ArgumentParser(
|
||||
description="RAG Chunker - Semantic chunking for RAG pipelines"
|
||||
)
|
||||
parser.add_argument("skill_dir", type=Path, help="Path to skill directory")
|
||||
parser.add_argument("--output", "-o", type=Path, help="Output JSON file")
|
||||
parser.add_argument("--chunk-size", type=int, default=512, help="Target chunk size in tokens")
|
||||
|
||||
@@ -59,27 +59,26 @@ def get_storage_adaptor(provider: str, **kwargs) -> BaseStorageAdaptor:
|
||||
account_name='myaccount')
|
||||
"""
|
||||
adaptors = {
|
||||
's3': S3StorageAdaptor,
|
||||
'gcs': GCSStorageAdaptor,
|
||||
'azure': AzureStorageAdaptor,
|
||||
"s3": S3StorageAdaptor,
|
||||
"gcs": GCSStorageAdaptor,
|
||||
"azure": AzureStorageAdaptor,
|
||||
}
|
||||
|
||||
provider_lower = provider.lower()
|
||||
if provider_lower not in adaptors:
|
||||
supported = ', '.join(adaptors.keys())
|
||||
supported = ", ".join(adaptors.keys())
|
||||
raise ValueError(
|
||||
f"Unsupported storage provider: {provider}. "
|
||||
f"Supported providers: {supported}"
|
||||
f"Unsupported storage provider: {provider}. Supported providers: {supported}"
|
||||
)
|
||||
|
||||
return adaptors[provider_lower](**kwargs)
|
||||
|
||||
|
||||
__all__ = [
|
||||
'BaseStorageAdaptor',
|
||||
'StorageObject',
|
||||
'S3StorageAdaptor',
|
||||
'GCSStorageAdaptor',
|
||||
'AzureStorageAdaptor',
|
||||
'get_storage_adaptor',
|
||||
"BaseStorageAdaptor",
|
||||
"StorageObject",
|
||||
"S3StorageAdaptor",
|
||||
"GCSStorageAdaptor",
|
||||
"AzureStorageAdaptor",
|
||||
"get_storage_adaptor",
|
||||
]
|
||||
|
||||
@@ -9,6 +9,7 @@ from datetime import datetime, timedelta
|
||||
try:
|
||||
from azure.storage.blob import BlobServiceClient, BlobSasPermissions, generate_blob_sas
|
||||
from azure.core.exceptions import ResourceNotFoundError
|
||||
|
||||
AZURE_AVAILABLE = True
|
||||
except ImportError:
|
||||
AZURE_AVAILABLE = False
|
||||
@@ -65,38 +66,30 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
|
||||
"Install with: pip install azure-storage-blob"
|
||||
)
|
||||
|
||||
if 'container' not in kwargs:
|
||||
if "container" not in kwargs:
|
||||
raise ValueError("container parameter is required for Azure storage")
|
||||
|
||||
self.container_name = kwargs['container']
|
||||
self.container_name = kwargs["container"]
|
||||
|
||||
# Initialize BlobServiceClient
|
||||
if 'connection_string' in kwargs:
|
||||
connection_string = kwargs['connection_string']
|
||||
if "connection_string" in kwargs:
|
||||
connection_string = kwargs["connection_string"]
|
||||
else:
|
||||
connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
|
||||
connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
|
||||
|
||||
if connection_string:
|
||||
self.blob_service_client = BlobServiceClient.from_connection_string(
|
||||
connection_string
|
||||
)
|
||||
self.blob_service_client = BlobServiceClient.from_connection_string(connection_string)
|
||||
# Extract account name from connection string
|
||||
self.account_name = None
|
||||
self.account_key = None
|
||||
for part in connection_string.split(';'):
|
||||
if part.startswith('AccountName='):
|
||||
self.account_name = part.split('=', 1)[1]
|
||||
elif part.startswith('AccountKey='):
|
||||
self.account_key = part.split('=', 1)[1]
|
||||
for part in connection_string.split(";"):
|
||||
if part.startswith("AccountName="):
|
||||
self.account_name = part.split("=", 1)[1]
|
||||
elif part.startswith("AccountKey="):
|
||||
self.account_key = part.split("=", 1)[1]
|
||||
else:
|
||||
account_name = kwargs.get(
|
||||
'account_name',
|
||||
os.getenv('AZURE_STORAGE_ACCOUNT_NAME')
|
||||
)
|
||||
account_key = kwargs.get(
|
||||
'account_key',
|
||||
os.getenv('AZURE_STORAGE_ACCOUNT_KEY')
|
||||
)
|
||||
account_name = kwargs.get("account_name", os.getenv("AZURE_STORAGE_ACCOUNT_NAME"))
|
||||
account_key = kwargs.get("account_key", os.getenv("AZURE_STORAGE_ACCOUNT_KEY"))
|
||||
|
||||
if not account_name or not account_key:
|
||||
raise ValueError(
|
||||
@@ -108,13 +101,10 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
|
||||
self.account_key = account_key
|
||||
account_url = f"https://{account_name}.blob.core.windows.net"
|
||||
self.blob_service_client = BlobServiceClient(
|
||||
account_url=account_url,
|
||||
credential=account_key
|
||||
account_url=account_url, credential=account_key
|
||||
)
|
||||
|
||||
self.container_client = self.blob_service_client.get_container_client(
|
||||
self.container_name
|
||||
)
|
||||
self.container_client = self.blob_service_client.get_container_client(self.container_name)
|
||||
|
||||
def upload_file(
|
||||
self, local_path: str, remote_path: str, metadata: dict[str, str] | None = None
|
||||
@@ -128,11 +118,7 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
|
||||
blob_client = self.container_client.get_blob_client(remote_path)
|
||||
|
||||
with open(local_file, "rb") as data:
|
||||
blob_client.upload_blob(
|
||||
data,
|
||||
overwrite=True,
|
||||
metadata=metadata
|
||||
)
|
||||
blob_client.upload_blob(data, overwrite=True, metadata=metadata)
|
||||
|
||||
return f"https://{self.account_name}.blob.core.windows.net/{self.container_name}/{remote_path}"
|
||||
except Exception as e:
|
||||
@@ -164,25 +150,26 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
|
||||
except Exception as e:
|
||||
raise Exception(f"Azure deletion failed: {e}") from e
|
||||
|
||||
def list_files(
|
||||
self, prefix: str = "", max_results: int = 1000
|
||||
) -> list[StorageObject]:
|
||||
def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]:
|
||||
"""List files in Azure container."""
|
||||
try:
|
||||
blobs = self.container_client.list_blobs(
|
||||
name_starts_with=prefix,
|
||||
results_per_page=max_results
|
||||
name_starts_with=prefix, results_per_page=max_results
|
||||
)
|
||||
|
||||
files = []
|
||||
for blob in blobs:
|
||||
files.append(StorageObject(
|
||||
key=blob.name,
|
||||
size=blob.size,
|
||||
last_modified=blob.last_modified.isoformat() if blob.last_modified else None,
|
||||
etag=blob.etag,
|
||||
metadata=blob.metadata
|
||||
))
|
||||
files.append(
|
||||
StorageObject(
|
||||
key=blob.name,
|
||||
size=blob.size,
|
||||
last_modified=blob.last_modified.isoformat()
|
||||
if blob.last_modified
|
||||
else None,
|
||||
etag=blob.etag,
|
||||
metadata=blob.metadata,
|
||||
)
|
||||
)
|
||||
|
||||
return files
|
||||
except Exception as e:
|
||||
@@ -205,9 +192,7 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
|
||||
raise FileNotFoundError(f"Remote file not found: {remote_path}")
|
||||
|
||||
if not self.account_name or not self.account_key:
|
||||
raise ValueError(
|
||||
"Account name and key are required for SAS URL generation"
|
||||
)
|
||||
raise ValueError("Account name and key are required for SAS URL generation")
|
||||
|
||||
sas_token = generate_blob_sas(
|
||||
account_name=self.account_name,
|
||||
@@ -215,7 +200,7 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
|
||||
blob_name=remote_path,
|
||||
account_key=self.account_key,
|
||||
permission=BlobSasPermissions(read=True),
|
||||
expiry=datetime.utcnow() + timedelta(seconds=expires_in)
|
||||
expiry=datetime.utcnow() + timedelta(seconds=expires_in),
|
||||
)
|
||||
|
||||
return f"{blob_client.url}?{sas_token}"
|
||||
@@ -239,12 +224,13 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
|
||||
|
||||
# Wait for copy to complete
|
||||
properties = dest_blob.get_blob_properties()
|
||||
while properties.copy.status == 'pending':
|
||||
while properties.copy.status == "pending":
|
||||
import time
|
||||
|
||||
time.sleep(0.1)
|
||||
properties = dest_blob.get_blob_properties()
|
||||
|
||||
if properties.copy.status != 'success':
|
||||
if properties.copy.status != "success":
|
||||
raise Exception(f"Copy failed with status: {properties.copy.status}")
|
||||
|
||||
except FileNotFoundError:
|
||||
|
||||
@@ -95,9 +95,7 @@ class BaseStorageAdaptor(ABC):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def list_files(
|
||||
self, prefix: str = "", max_results: int = 1000
|
||||
) -> list[StorageObject]:
|
||||
def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]:
|
||||
"""
|
||||
List files in cloud storage.
|
||||
|
||||
@@ -191,9 +189,7 @@ class BaseStorageAdaptor(ABC):
|
||||
|
||||
return uploaded_files
|
||||
|
||||
def download_directory(
|
||||
self, remote_prefix: str, local_dir: str
|
||||
) -> list[str]:
|
||||
def download_directory(self, remote_prefix: str, local_dir: str) -> list[str]:
|
||||
"""
|
||||
Download directory from cloud storage.
|
||||
|
||||
@@ -245,9 +241,7 @@ class BaseStorageAdaptor(ABC):
|
||||
raise FileNotFoundError(f"File not found: {remote_path}")
|
||||
return files[0].size
|
||||
|
||||
def copy_file(
|
||||
self, source_path: str, dest_path: str
|
||||
) -> None:
|
||||
def copy_file(self, source_path: str, dest_path: str) -> None:
|
||||
"""
|
||||
Copy file within cloud storage.
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ from datetime import timedelta
|
||||
try:
|
||||
from google.cloud import storage
|
||||
from google.cloud.exceptions import NotFound
|
||||
|
||||
GCS_AVAILABLE = True
|
||||
except ImportError:
|
||||
GCS_AVAILABLE = False
|
||||
@@ -63,19 +64,19 @@ class GCSStorageAdaptor(BaseStorageAdaptor):
|
||||
"Install with: pip install google-cloud-storage"
|
||||
)
|
||||
|
||||
if 'bucket' not in kwargs:
|
||||
if "bucket" not in kwargs:
|
||||
raise ValueError("bucket parameter is required for GCS storage")
|
||||
|
||||
self.bucket_name = kwargs['bucket']
|
||||
self.project = kwargs.get('project', os.getenv('GOOGLE_CLOUD_PROJECT'))
|
||||
self.bucket_name = kwargs["bucket"]
|
||||
self.project = kwargs.get("project", os.getenv("GOOGLE_CLOUD_PROJECT"))
|
||||
|
||||
# Initialize GCS client
|
||||
client_kwargs = {}
|
||||
if self.project:
|
||||
client_kwargs['project'] = self.project
|
||||
client_kwargs["project"] = self.project
|
||||
|
||||
if 'credentials_path' in kwargs:
|
||||
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = kwargs['credentials_path']
|
||||
if "credentials_path" in kwargs:
|
||||
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = kwargs["credentials_path"]
|
||||
|
||||
self.storage_client = storage.Client(**client_kwargs)
|
||||
self.bucket = self.storage_client.bucket(self.bucket_name)
|
||||
@@ -122,26 +123,24 @@ class GCSStorageAdaptor(BaseStorageAdaptor):
|
||||
except Exception as e:
|
||||
raise Exception(f"GCS deletion failed: {e}") from e
|
||||
|
||||
def list_files(
|
||||
self, prefix: str = "", max_results: int = 1000
|
||||
) -> list[StorageObject]:
|
||||
def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]:
|
||||
"""List files in GCS bucket."""
|
||||
try:
|
||||
blobs = self.storage_client.list_blobs(
|
||||
self.bucket_name,
|
||||
prefix=prefix,
|
||||
max_results=max_results
|
||||
self.bucket_name, prefix=prefix, max_results=max_results
|
||||
)
|
||||
|
||||
files = []
|
||||
for blob in blobs:
|
||||
files.append(StorageObject(
|
||||
key=blob.name,
|
||||
size=blob.size,
|
||||
last_modified=blob.updated.isoformat() if blob.updated else None,
|
||||
etag=blob.etag,
|
||||
metadata=blob.metadata
|
||||
))
|
||||
files.append(
|
||||
StorageObject(
|
||||
key=blob.name,
|
||||
size=blob.size,
|
||||
last_modified=blob.updated.isoformat() if blob.updated else None,
|
||||
etag=blob.etag,
|
||||
metadata=blob.metadata,
|
||||
)
|
||||
)
|
||||
|
||||
return files
|
||||
except Exception as e:
|
||||
@@ -164,9 +163,7 @@ class GCSStorageAdaptor(BaseStorageAdaptor):
|
||||
raise FileNotFoundError(f"Remote file not found: {remote_path}")
|
||||
|
||||
url = blob.generate_signed_url(
|
||||
version="v4",
|
||||
expiration=timedelta(seconds=expires_in),
|
||||
method="GET"
|
||||
version="v4", expiration=timedelta(seconds=expires_in), method="GET"
|
||||
)
|
||||
return url
|
||||
except FileNotFoundError:
|
||||
@@ -182,11 +179,7 @@ class GCSStorageAdaptor(BaseStorageAdaptor):
|
||||
if not source_blob.exists():
|
||||
raise FileNotFoundError(f"Source file not found: {source_path}")
|
||||
|
||||
self.bucket.copy_blob(
|
||||
source_blob,
|
||||
self.bucket,
|
||||
dest_path
|
||||
)
|
||||
self.bucket.copy_blob(source_blob, self.bucket, dest_path)
|
||||
except FileNotFoundError:
|
||||
raise
|
||||
except Exception as e:
|
||||
|
||||
@@ -8,6 +8,7 @@ from pathlib import Path
|
||||
try:
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
BOTO3_AVAILABLE = True
|
||||
except ImportError:
|
||||
BOTO3_AVAILABLE = False
|
||||
@@ -63,33 +64,30 @@ class S3StorageAdaptor(BaseStorageAdaptor):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
if not BOTO3_AVAILABLE:
|
||||
raise ImportError(
|
||||
"boto3 is required for S3 storage. "
|
||||
"Install with: pip install boto3"
|
||||
)
|
||||
raise ImportError("boto3 is required for S3 storage. Install with: pip install boto3")
|
||||
|
||||
if 'bucket' not in kwargs:
|
||||
if "bucket" not in kwargs:
|
||||
raise ValueError("bucket parameter is required for S3 storage")
|
||||
|
||||
self.bucket = kwargs['bucket']
|
||||
self.region = kwargs.get('region', os.getenv('AWS_DEFAULT_REGION', 'us-east-1'))
|
||||
self.bucket = kwargs["bucket"]
|
||||
self.region = kwargs.get("region", os.getenv("AWS_DEFAULT_REGION", "us-east-1"))
|
||||
|
||||
# Initialize S3 client
|
||||
client_kwargs = {
|
||||
'region_name': self.region,
|
||||
"region_name": self.region,
|
||||
}
|
||||
|
||||
if 'endpoint_url' in kwargs:
|
||||
client_kwargs['endpoint_url'] = kwargs['endpoint_url']
|
||||
if "endpoint_url" in kwargs:
|
||||
client_kwargs["endpoint_url"] = kwargs["endpoint_url"]
|
||||
|
||||
if 'aws_access_key_id' in kwargs:
|
||||
client_kwargs['aws_access_key_id'] = kwargs['aws_access_key_id']
|
||||
if "aws_access_key_id" in kwargs:
|
||||
client_kwargs["aws_access_key_id"] = kwargs["aws_access_key_id"]
|
||||
|
||||
if 'aws_secret_access_key' in kwargs:
|
||||
client_kwargs['aws_secret_access_key'] = kwargs['aws_secret_access_key']
|
||||
if "aws_secret_access_key" in kwargs:
|
||||
client_kwargs["aws_secret_access_key"] = kwargs["aws_secret_access_key"]
|
||||
|
||||
self.s3_client = boto3.client('s3', **client_kwargs)
|
||||
self.s3_resource = boto3.resource('s3', **client_kwargs)
|
||||
self.s3_client = boto3.client("s3", **client_kwargs)
|
||||
self.s3_resource = boto3.resource("s3", **client_kwargs)
|
||||
|
||||
def upload_file(
|
||||
self, local_path: str, remote_path: str, metadata: dict[str, str] | None = None
|
||||
@@ -101,14 +99,14 @@ class S3StorageAdaptor(BaseStorageAdaptor):
|
||||
|
||||
extra_args = {}
|
||||
if metadata:
|
||||
extra_args['Metadata'] = metadata
|
||||
extra_args["Metadata"] = metadata
|
||||
|
||||
try:
|
||||
self.s3_client.upload_file(
|
||||
str(local_file),
|
||||
self.bucket,
|
||||
remote_path,
|
||||
ExtraArgs=extra_args if extra_args else None
|
||||
ExtraArgs=extra_args if extra_args else None,
|
||||
)
|
||||
return f"s3://{self.bucket}/{remote_path}"
|
||||
except ClientError as e:
|
||||
@@ -120,50 +118,41 @@ class S3StorageAdaptor(BaseStorageAdaptor):
|
||||
local_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
self.s3_client.download_file(
|
||||
self.bucket,
|
||||
remote_path,
|
||||
str(local_file)
|
||||
)
|
||||
self.s3_client.download_file(self.bucket, remote_path, str(local_file))
|
||||
except ClientError as e:
|
||||
if e.response['Error']['Code'] == '404':
|
||||
if e.response["Error"]["Code"] == "404":
|
||||
raise FileNotFoundError(f"Remote file not found: {remote_path}") from e
|
||||
raise Exception(f"S3 download failed: {e}") from e
|
||||
|
||||
def delete_file(self, remote_path: str) -> None:
|
||||
"""Delete file from S3."""
|
||||
try:
|
||||
self.s3_client.delete_object(
|
||||
Bucket=self.bucket,
|
||||
Key=remote_path
|
||||
)
|
||||
self.s3_client.delete_object(Bucket=self.bucket, Key=remote_path)
|
||||
except ClientError as e:
|
||||
raise Exception(f"S3 deletion failed: {e}") from e
|
||||
|
||||
def list_files(
|
||||
self, prefix: str = "", max_results: int = 1000
|
||||
) -> list[StorageObject]:
|
||||
def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]:
|
||||
"""List files in S3 bucket."""
|
||||
try:
|
||||
paginator = self.s3_client.get_paginator('list_objects_v2')
|
||||
paginator = self.s3_client.get_paginator("list_objects_v2")
|
||||
page_iterator = paginator.paginate(
|
||||
Bucket=self.bucket,
|
||||
Prefix=prefix,
|
||||
PaginationConfig={'MaxItems': max_results}
|
||||
Bucket=self.bucket, Prefix=prefix, PaginationConfig={"MaxItems": max_results}
|
||||
)
|
||||
|
||||
files = []
|
||||
for page in page_iterator:
|
||||
if 'Contents' not in page:
|
||||
if "Contents" not in page:
|
||||
continue
|
||||
|
||||
for obj in page['Contents']:
|
||||
files.append(StorageObject(
|
||||
key=obj['Key'],
|
||||
size=obj['Size'],
|
||||
last_modified=obj['LastModified'].isoformat(),
|
||||
etag=obj.get('ETag', '').strip('"')
|
||||
))
|
||||
for obj in page["Contents"]:
|
||||
files.append(
|
||||
StorageObject(
|
||||
key=obj["Key"],
|
||||
size=obj["Size"],
|
||||
last_modified=obj["LastModified"].isoformat(),
|
||||
etag=obj.get("ETag", "").strip('"'),
|
||||
)
|
||||
)
|
||||
|
||||
return files
|
||||
except ClientError as e:
|
||||
@@ -172,13 +161,10 @@ class S3StorageAdaptor(BaseStorageAdaptor):
|
||||
def file_exists(self, remote_path: str) -> bool:
|
||||
"""Check if file exists in S3."""
|
||||
try:
|
||||
self.s3_client.head_object(
|
||||
Bucket=self.bucket,
|
||||
Key=remote_path
|
||||
)
|
||||
self.s3_client.head_object(Bucket=self.bucket, Key=remote_path)
|
||||
return True
|
||||
except ClientError as e:
|
||||
if e.response['Error']['Code'] == '404':
|
||||
if e.response["Error"]["Code"] == "404":
|
||||
return False
|
||||
raise Exception(f"S3 head_object failed: {e}") from e
|
||||
|
||||
@@ -186,12 +172,9 @@ class S3StorageAdaptor(BaseStorageAdaptor):
|
||||
"""Generate presigned URL for S3 object."""
|
||||
try:
|
||||
url = self.s3_client.generate_presigned_url(
|
||||
'get_object',
|
||||
Params={
|
||||
'Bucket': self.bucket,
|
||||
'Key': remote_path
|
||||
},
|
||||
ExpiresIn=expires_in
|
||||
"get_object",
|
||||
Params={"Bucket": self.bucket, "Key": remote_path},
|
||||
ExpiresIn=expires_in,
|
||||
)
|
||||
return url
|
||||
except ClientError as e:
|
||||
@@ -200,16 +183,9 @@ class S3StorageAdaptor(BaseStorageAdaptor):
|
||||
def copy_file(self, source_path: str, dest_path: str) -> None:
|
||||
"""Copy file within S3 bucket (server-side copy)."""
|
||||
try:
|
||||
copy_source = {
|
||||
'Bucket': self.bucket,
|
||||
'Key': source_path
|
||||
}
|
||||
self.s3_client.copy_object(
|
||||
CopySource=copy_source,
|
||||
Bucket=self.bucket,
|
||||
Key=dest_path
|
||||
)
|
||||
copy_source = {"Bucket": self.bucket, "Key": source_path}
|
||||
self.s3_client.copy_object(CopySource=copy_source, Bucket=self.bucket, Key=dest_path)
|
||||
except ClientError as e:
|
||||
if e.response['Error']['Code'] == '404':
|
||||
if e.response["Error"]["Code"] == "404":
|
||||
raise FileNotFoundError(f"Source file not found: {source_path}") from e
|
||||
raise Exception(f"S3 copy failed: {e}") from e
|
||||
|
||||
@@ -17,6 +17,7 @@ import time
|
||||
@dataclass
|
||||
class ChunkMetadata:
|
||||
"""Metadata for a document chunk."""
|
||||
|
||||
chunk_id: str
|
||||
source: str
|
||||
category: str
|
||||
@@ -30,6 +31,7 @@ class ChunkMetadata:
|
||||
@dataclass
|
||||
class IngestionProgress:
|
||||
"""Progress tracking for streaming ingestion."""
|
||||
|
||||
total_documents: int
|
||||
processed_documents: int
|
||||
total_chunks: int
|
||||
@@ -81,7 +83,7 @@ class StreamingIngester:
|
||||
chunk_size: int = 4000,
|
||||
chunk_overlap: int = 200,
|
||||
batch_size: int = 100,
|
||||
max_memory_mb: int = 500
|
||||
max_memory_mb: int = 500,
|
||||
):
|
||||
"""
|
||||
Initialize streaming ingester.
|
||||
@@ -103,7 +105,7 @@ class StreamingIngester:
|
||||
content: str,
|
||||
metadata: dict,
|
||||
chunk_size: int | None = None,
|
||||
chunk_overlap: int | None = None
|
||||
chunk_overlap: int | None = None,
|
||||
) -> Iterator[tuple[str, ChunkMetadata]]:
|
||||
"""
|
||||
Split document into overlapping chunks.
|
||||
@@ -130,7 +132,7 @@ class StreamingIngester:
|
||||
chunk_index=0,
|
||||
total_chunks=1,
|
||||
char_start=0,
|
||||
char_end=len(content)
|
||||
char_end=len(content),
|
||||
)
|
||||
yield content, chunk_meta
|
||||
return
|
||||
@@ -162,7 +164,7 @@ class StreamingIngester:
|
||||
chunk_index=i,
|
||||
total_chunks=total_chunks,
|
||||
char_start=start,
|
||||
char_end=end
|
||||
char_end=end,
|
||||
)
|
||||
|
||||
yield chunk_text, chunk_meta
|
||||
@@ -170,17 +172,12 @@ class StreamingIngester:
|
||||
def _generate_chunk_id(self, content: str, metadata: dict, chunk_index: int) -> str:
|
||||
"""Generate deterministic chunk ID."""
|
||||
id_string = (
|
||||
f"{metadata.get('source', '')}-"
|
||||
f"{metadata.get('file', '')}-"
|
||||
f"{chunk_index}-"
|
||||
f"{content[:50]}"
|
||||
f"{metadata.get('source', '')}-{metadata.get('file', '')}-{chunk_index}-{content[:50]}"
|
||||
)
|
||||
return hashlib.md5(id_string.encode()).hexdigest()
|
||||
|
||||
def stream_skill_directory(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
callback: callable | None = None
|
||||
self, skill_dir: Path, callback: callable | None = None
|
||||
) -> Iterator[tuple[str, dict]]:
|
||||
"""
|
||||
Stream all documents from skill directory.
|
||||
@@ -218,7 +215,7 @@ class StreamingIngester:
|
||||
processed_chunks=0,
|
||||
failed_chunks=0,
|
||||
bytes_processed=0,
|
||||
start_time=time.time()
|
||||
start_time=time.time(),
|
||||
)
|
||||
|
||||
# Process each document
|
||||
@@ -235,11 +232,13 @@ class StreamingIngester:
|
||||
"category": category,
|
||||
"file": filename,
|
||||
"type": "documentation" if filename == "SKILL.md" else "reference",
|
||||
"version": "1.0.0"
|
||||
"version": "1.0.0",
|
||||
}
|
||||
|
||||
# Chunk document and yield chunks
|
||||
for chunk_count, (chunk_text, chunk_meta) in enumerate(self.chunk_document(content, metadata), start=1):
|
||||
for chunk_count, (chunk_text, chunk_meta) in enumerate(
|
||||
self.chunk_document(content, metadata), start=1
|
||||
):
|
||||
self.progress.total_chunks += 1
|
||||
|
||||
# Convert chunk metadata to dict
|
||||
@@ -272,9 +271,7 @@ class StreamingIngester:
|
||||
continue
|
||||
|
||||
def batch_iterator(
|
||||
self,
|
||||
chunks: Iterator[tuple[str, dict]],
|
||||
batch_size: int | None = None
|
||||
self, chunks: Iterator[tuple[str, dict]], batch_size: int | None = None
|
||||
) -> Iterator[list[tuple[str, dict]]]:
|
||||
"""
|
||||
Group chunks into batches for efficient processing.
|
||||
@@ -321,7 +318,7 @@ class StreamingIngester:
|
||||
"failed_chunks": self.progress.failed_chunks,
|
||||
"bytes_processed": self.progress.bytes_processed,
|
||||
},
|
||||
"state": state
|
||||
"state": state,
|
||||
}
|
||||
|
||||
checkpoint_path.write_text(json.dumps(checkpoint_data, indent=2))
|
||||
@@ -384,23 +381,25 @@ def main():
|
||||
parser = argparse.ArgumentParser(description="Stream and chunk skill documents")
|
||||
parser.add_argument("input", help="Input file or directory path")
|
||||
parser.add_argument("--chunk-size", type=int, default=4000, help="Chunk size in characters")
|
||||
parser.add_argument("--chunk-overlap", type=int, default=200, help="Chunk overlap in characters")
|
||||
parser.add_argument(
|
||||
"--chunk-overlap", type=int, default=200, help="Chunk overlap in characters"
|
||||
)
|
||||
parser.add_argument("--batch-size", type=int, default=100, help="Batch size for processing")
|
||||
parser.add_argument("--checkpoint", help="Checkpoint file path")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Initialize ingester
|
||||
ingester = StreamingIngester(
|
||||
chunk_size=args.chunk_size,
|
||||
chunk_overlap=args.chunk_overlap,
|
||||
batch_size=args.batch_size
|
||||
chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap, batch_size=args.batch_size
|
||||
)
|
||||
|
||||
# Progress callback
|
||||
def on_progress(progress: IngestionProgress):
|
||||
if progress.processed_chunks % 10 == 0:
|
||||
print(f"Progress: {progress.progress_percent:.1f}% - "
|
||||
f"{progress.processed_chunks}/{progress.total_chunks} chunks")
|
||||
print(
|
||||
f"Progress: {progress.progress_percent:.1f}% - "
|
||||
f"{progress.processed_chunks}/{progress.total_chunks} chunks"
|
||||
)
|
||||
|
||||
# Stream input
|
||||
input_path = Path(args.input)
|
||||
@@ -416,17 +415,23 @@ def main():
|
||||
metadata = {"source": input_path.stem, "file": input_path.name}
|
||||
file_chunks = ingester.chunk_document(content, metadata)
|
||||
# Convert to generator format matching stream_skill_directory
|
||||
chunks = ((text, {
|
||||
"content": text,
|
||||
"chunk_id": meta.chunk_id,
|
||||
"source": meta.source,
|
||||
"category": meta.category,
|
||||
"file": meta.file,
|
||||
"chunk_index": meta.chunk_index,
|
||||
"total_chunks": meta.total_chunks,
|
||||
"char_start": meta.char_start,
|
||||
"char_end": meta.char_end,
|
||||
}) for text, meta in file_chunks)
|
||||
chunks = (
|
||||
(
|
||||
text,
|
||||
{
|
||||
"content": text,
|
||||
"chunk_id": meta.chunk_id,
|
||||
"source": meta.source,
|
||||
"category": meta.category,
|
||||
"file": meta.file,
|
||||
"chunk_index": meta.chunk_index,
|
||||
"total_chunks": meta.total_chunks,
|
||||
"char_start": meta.char_start,
|
||||
"char_end": meta.char_end,
|
||||
},
|
||||
)
|
||||
for text, meta in file_chunks
|
||||
)
|
||||
|
||||
# Process in batches
|
||||
all_chunks = []
|
||||
@@ -437,8 +442,7 @@ def main():
|
||||
# Save checkpoint if specified
|
||||
if args.checkpoint:
|
||||
ingester.save_checkpoint(
|
||||
Path(args.checkpoint),
|
||||
{"processed_batches": len(all_chunks) // args.batch_size}
|
||||
Path(args.checkpoint), {"processed_batches": len(all_chunks) // args.batch_size}
|
||||
)
|
||||
|
||||
# Final progress
|
||||
@@ -449,4 +453,5 @@ def main():
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
sys.exit(main())
|
||||
|
||||
@@ -22,9 +22,7 @@ def handle_signal(_signum, _frame):
|
||||
def start_command(args):
|
||||
"""Start monitoring."""
|
||||
monitor = SyncMonitor(
|
||||
config_path=args.config,
|
||||
check_interval=args.interval,
|
||||
auto_update=args.auto_update
|
||||
config_path=args.config, check_interval=args.interval, auto_update=args.auto_update
|
||||
)
|
||||
|
||||
# Register signal handlers
|
||||
@@ -42,6 +40,7 @@ def start_command(args):
|
||||
# Keep running
|
||||
while True:
|
||||
import time
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
@@ -53,7 +52,7 @@ def check_command(args):
|
||||
"""Check for changes once."""
|
||||
monitor = SyncMonitor(
|
||||
config_path=args.config,
|
||||
check_interval=3600 # Not used for single check
|
||||
check_interval=3600, # Not used for single check
|
||||
)
|
||||
|
||||
print(f"🔍 Checking {args.config} for changes...")
|
||||
@@ -82,7 +81,7 @@ def check_command(args):
|
||||
print(f" • {change.url}")
|
||||
if change.diff and args.diff:
|
||||
print(f" Diff preview (first 5 lines):")
|
||||
for line in change.diff.split('\n')[:5]:
|
||||
for line in change.diff.split("\n")[:5]:
|
||||
print(f" {line}")
|
||||
|
||||
if report.deleted:
|
||||
@@ -95,10 +94,7 @@ def check_command(args):
|
||||
|
||||
def stats_command(args):
|
||||
"""Show monitoring statistics."""
|
||||
monitor = SyncMonitor(
|
||||
config_path=args.config,
|
||||
check_interval=3600
|
||||
)
|
||||
monitor = SyncMonitor(config_path=args.config, check_interval=3600)
|
||||
|
||||
stats = monitor.stats()
|
||||
|
||||
@@ -117,7 +113,7 @@ def reset_command(args):
|
||||
state_file = Path(f"{args.skill_name}_sync.json")
|
||||
|
||||
if state_file.exists():
|
||||
if args.force or input(f"⚠️ Reset state for {args.skill_name}? [y/N]: ").lower() == 'y':
|
||||
if args.force or input(f"⚠️ Reset state for {args.skill_name}? [y/N]: ").lower() == "y":
|
||||
state_file.unlink()
|
||||
print(f"✅ State reset for {args.skill_name}")
|
||||
else:
|
||||
@@ -129,7 +125,7 @@ def reset_command(args):
|
||||
def main():
|
||||
"""Main entry point."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Monitor documentation for changes and update skills',
|
||||
description="Monitor documentation for changes and update skills",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
@@ -153,52 +149,39 @@ Examples:
|
||||
|
||||
# Reset state
|
||||
skill-seekers-sync reset --skill-name react
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
subparsers = parser.add_subparsers(dest='command', help='Command to execute')
|
||||
subparsers = parser.add_subparsers(dest="command", help="Command to execute")
|
||||
|
||||
# Start command
|
||||
start_parser = subparsers.add_parser('start', help='Start continuous monitoring')
|
||||
start_parser.add_argument('--config', required=True, help='Path to skill config file')
|
||||
start_parser = subparsers.add_parser("start", help="Start continuous monitoring")
|
||||
start_parser.add_argument("--config", required=True, help="Path to skill config file")
|
||||
start_parser.add_argument(
|
||||
'--interval', '-i',
|
||||
"--interval",
|
||||
"-i",
|
||||
type=int,
|
||||
default=3600,
|
||||
help='Check interval in seconds (default: 3600 = 1 hour)'
|
||||
help="Check interval in seconds (default: 3600 = 1 hour)",
|
||||
)
|
||||
start_parser.add_argument(
|
||||
'--auto-update',
|
||||
action='store_true',
|
||||
help='Automatically rebuild skill on changes'
|
||||
"--auto-update", action="store_true", help="Automatically rebuild skill on changes"
|
||||
)
|
||||
|
||||
# Check command
|
||||
check_parser = subparsers.add_parser('check', help='Check for changes once')
|
||||
check_parser.add_argument('--config', required=True, help='Path to skill config file')
|
||||
check_parser.add_argument(
|
||||
'--diff', '-d',
|
||||
action='store_true',
|
||||
help='Generate content diffs'
|
||||
)
|
||||
check_parser.add_argument(
|
||||
'--verbose', '-v',
|
||||
action='store_true',
|
||||
help='Show detailed output'
|
||||
)
|
||||
check_parser = subparsers.add_parser("check", help="Check for changes once")
|
||||
check_parser.add_argument("--config", required=True, help="Path to skill config file")
|
||||
check_parser.add_argument("--diff", "-d", action="store_true", help="Generate content diffs")
|
||||
check_parser.add_argument("--verbose", "-v", action="store_true", help="Show detailed output")
|
||||
|
||||
# Stats command
|
||||
stats_parser = subparsers.add_parser('stats', help='Show monitoring statistics')
|
||||
stats_parser.add_argument('--config', required=True, help='Path to skill config file')
|
||||
stats_parser = subparsers.add_parser("stats", help="Show monitoring statistics")
|
||||
stats_parser.add_argument("--config", required=True, help="Path to skill config file")
|
||||
|
||||
# Reset command
|
||||
reset_parser = subparsers.add_parser('reset', help='Reset monitoring state')
|
||||
reset_parser.add_argument('--skill-name', required=True, help='Skill name')
|
||||
reset_parser.add_argument(
|
||||
'--force', '-f',
|
||||
action='store_true',
|
||||
help='Skip confirmation'
|
||||
)
|
||||
reset_parser = subparsers.add_parser("reset", help="Reset monitoring state")
|
||||
reset_parser.add_argument("--skill-name", required=True, help="Skill name")
|
||||
reset_parser.add_argument("--force", "-f", action="store_true", help="Skip confirmation")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -207,18 +190,18 @@ Examples:
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
if args.command == 'start':
|
||||
if args.command == "start":
|
||||
start_command(args)
|
||||
elif args.command == 'check':
|
||||
elif args.command == "check":
|
||||
check_command(args)
|
||||
elif args.command == 'stats':
|
||||
elif args.command == "stats":
|
||||
stats_command(args)
|
||||
elif args.command == 'reset':
|
||||
elif args.command == "reset":
|
||||
reset_command(args)
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -59,7 +59,7 @@ def upload_skill_api(package_path, target="claude", api_key=None, **kwargs):
|
||||
api_key = os.environ.get(adaptor.get_env_var_name(), "").strip()
|
||||
|
||||
# API key validation only for platforms that require it
|
||||
if target in ['claude', 'gemini', 'openai']:
|
||||
if target in ["claude", "gemini", "openai"]:
|
||||
if not api_key:
|
||||
return False, f"{adaptor.get_env_var_name()} not set. Export your API key first."
|
||||
|
||||
@@ -172,41 +172,39 @@ Examples:
|
||||
# ChromaDB upload options
|
||||
parser.add_argument(
|
||||
"--chroma-url",
|
||||
help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)"
|
||||
help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--persist-directory",
|
||||
help="Local directory for persistent ChromaDB storage (default: ./chroma_db)"
|
||||
help="Local directory for persistent ChromaDB storage (default: ./chroma_db)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--embedding-function",
|
||||
choices=["openai", "sentence-transformers", "none"],
|
||||
help="Embedding function for ChromaDB/Weaviate (default: platform default)"
|
||||
help="Embedding function for ChromaDB/Weaviate (default: platform default)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--openai-api-key",
|
||||
help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)"
|
||||
"--openai-api-key", help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)"
|
||||
)
|
||||
|
||||
# Weaviate upload options
|
||||
parser.add_argument(
|
||||
"--weaviate-url",
|
||||
default="http://localhost:8080",
|
||||
help="Weaviate URL (default: http://localhost:8080)"
|
||||
help="Weaviate URL (default: http://localhost:8080)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--use-cloud",
|
||||
action="store_true",
|
||||
help="Use Weaviate Cloud (requires --api-key and --cluster-url)"
|
||||
help="Use Weaviate Cloud (requires --api-key and --cluster-url)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--cluster-url",
|
||||
help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)"
|
||||
"--cluster-url", help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
@@ -214,28 +212,30 @@ Examples:
|
||||
# Build kwargs for vector DB upload
|
||||
upload_kwargs = {}
|
||||
|
||||
if args.target == 'chroma':
|
||||
if args.target == "chroma":
|
||||
if args.chroma_url:
|
||||
upload_kwargs['chroma_url'] = args.chroma_url
|
||||
upload_kwargs["chroma_url"] = args.chroma_url
|
||||
if args.persist_directory:
|
||||
upload_kwargs['persist_directory'] = args.persist_directory
|
||||
upload_kwargs["persist_directory"] = args.persist_directory
|
||||
if args.embedding_function:
|
||||
upload_kwargs['embedding_function'] = args.embedding_function
|
||||
upload_kwargs["embedding_function"] = args.embedding_function
|
||||
if args.openai_api_key:
|
||||
upload_kwargs['openai_api_key'] = args.openai_api_key
|
||||
upload_kwargs["openai_api_key"] = args.openai_api_key
|
||||
|
||||
elif args.target == 'weaviate':
|
||||
upload_kwargs['weaviate_url'] = args.weaviate_url
|
||||
upload_kwargs['use_cloud'] = args.use_cloud
|
||||
elif args.target == "weaviate":
|
||||
upload_kwargs["weaviate_url"] = args.weaviate_url
|
||||
upload_kwargs["use_cloud"] = args.use_cloud
|
||||
if args.cluster_url:
|
||||
upload_kwargs['cluster_url'] = args.cluster_url
|
||||
upload_kwargs["cluster_url"] = args.cluster_url
|
||||
if args.embedding_function:
|
||||
upload_kwargs['embedding_function'] = args.embedding_function
|
||||
upload_kwargs["embedding_function"] = args.embedding_function
|
||||
if args.openai_api_key:
|
||||
upload_kwargs['openai_api_key'] = args.openai_api_key
|
||||
upload_kwargs["openai_api_key"] = args.openai_api_key
|
||||
|
||||
# Upload skill
|
||||
success, message = upload_skill_api(args.package_file, args.target, args.api_key, **upload_kwargs)
|
||||
success, message = upload_skill_api(
|
||||
args.package_file, args.target, args.api_key, **upload_kwargs
|
||||
)
|
||||
|
||||
if success:
|
||||
sys.exit(0)
|
||||
|
||||
@@ -23,9 +23,9 @@ from .generator import EmbeddingGenerator
|
||||
from .cache import EmbeddingCache
|
||||
|
||||
__all__ = [
|
||||
'EmbeddingRequest',
|
||||
'EmbeddingResponse',
|
||||
'BatchEmbeddingRequest',
|
||||
'EmbeddingGenerator',
|
||||
'EmbeddingCache',
|
||||
"EmbeddingRequest",
|
||||
"EmbeddingResponse",
|
||||
"BatchEmbeddingRequest",
|
||||
"EmbeddingGenerator",
|
||||
"EmbeddingCache",
|
||||
]
|
||||
|
||||
@@ -74,12 +74,7 @@ class EmbeddingCache:
|
||||
|
||||
self.conn.commit()
|
||||
|
||||
def set(
|
||||
self,
|
||||
hash_key: str,
|
||||
embedding: list[float],
|
||||
model: str
|
||||
) -> None:
|
||||
def set(self, hash_key: str, embedding: list[float], model: str) -> None:
|
||||
"""
|
||||
Store embedding in cache.
|
||||
|
||||
@@ -94,11 +89,14 @@ class EmbeddingCache:
|
||||
embedding_json = json.dumps(embedding)
|
||||
dimensions = len(embedding)
|
||||
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT OR REPLACE INTO embeddings
|
||||
(hash, embedding, model, dimensions, created_at, accessed_at, access_count)
|
||||
VALUES (?, ?, ?, ?, ?, ?, 1)
|
||||
""", (hash_key, embedding_json, model, dimensions, now, now))
|
||||
""",
|
||||
(hash_key, embedding_json, model, dimensions, now, now),
|
||||
)
|
||||
|
||||
self.conn.commit()
|
||||
|
||||
@@ -115,11 +113,14 @@ class EmbeddingCache:
|
||||
cursor = self.conn.cursor()
|
||||
|
||||
# Get embedding
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
SELECT embedding, created_at
|
||||
FROM embeddings
|
||||
WHERE hash = ?
|
||||
""", (hash_key,))
|
||||
""",
|
||||
(hash_key,),
|
||||
)
|
||||
|
||||
row = cursor.fetchone()
|
||||
if not row:
|
||||
@@ -136,11 +137,14 @@ class EmbeddingCache:
|
||||
|
||||
# Update access stats
|
||||
now = datetime.utcnow().isoformat()
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
UPDATE embeddings
|
||||
SET accessed_at = ?, access_count = access_count + 1
|
||||
WHERE hash = ?
|
||||
""", (now, hash_key))
|
||||
""",
|
||||
(now, hash_key),
|
||||
)
|
||||
self.conn.commit()
|
||||
|
||||
return json.loads(embedding_json)
|
||||
@@ -178,11 +182,14 @@ class EmbeddingCache:
|
||||
"""
|
||||
cursor = self.conn.cursor()
|
||||
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
SELECT created_at
|
||||
FROM embeddings
|
||||
WHERE hash = ?
|
||||
""", (hash_key,))
|
||||
""",
|
||||
(hash_key,),
|
||||
)
|
||||
|
||||
row = cursor.fetchone()
|
||||
if not row:
|
||||
@@ -206,10 +213,13 @@ class EmbeddingCache:
|
||||
"""
|
||||
cursor = self.conn.cursor()
|
||||
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
DELETE FROM embeddings
|
||||
WHERE hash = ?
|
||||
""", (hash_key,))
|
||||
""",
|
||||
(hash_key,),
|
||||
)
|
||||
|
||||
self.conn.commit()
|
||||
|
||||
@@ -226,10 +236,13 @@ class EmbeddingCache:
|
||||
cursor = self.conn.cursor()
|
||||
|
||||
if model:
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
DELETE FROM embeddings
|
||||
WHERE model = ?
|
||||
""", (model,))
|
||||
""",
|
||||
(model,),
|
||||
)
|
||||
else:
|
||||
cursor.execute("DELETE FROM embeddings")
|
||||
|
||||
@@ -249,10 +262,13 @@ class EmbeddingCache:
|
||||
|
||||
cutoff = (datetime.utcnow() - timedelta(days=self.ttl_days)).isoformat()
|
||||
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
DELETE FROM embeddings
|
||||
WHERE created_at < ?
|
||||
""", (cutoff,))
|
||||
""",
|
||||
(cutoff,),
|
||||
)
|
||||
|
||||
deleted = cursor.rowcount
|
||||
self.conn.commit()
|
||||
@@ -300,17 +316,19 @@ class EmbeddingCache:
|
||||
LIMIT 10
|
||||
""")
|
||||
top_accessed = [
|
||||
{"hash": row[0], "model": row[1], "access_count": row[2]}
|
||||
for row in cursor.fetchall()
|
||||
{"hash": row[0], "model": row[1], "access_count": row[2]} for row in cursor.fetchall()
|
||||
]
|
||||
|
||||
# Expired entries
|
||||
cutoff = (datetime.utcnow() - timedelta(days=self.ttl_days)).isoformat()
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
SELECT COUNT(*)
|
||||
FROM embeddings
|
||||
WHERE created_at < ?
|
||||
""", (cutoff,))
|
||||
""",
|
||||
(cutoff,),
|
||||
)
|
||||
expired = cursor.fetchone()[0]
|
||||
|
||||
return {
|
||||
@@ -318,7 +336,7 @@ class EmbeddingCache:
|
||||
"by_model": by_model,
|
||||
"top_accessed": top_accessed,
|
||||
"expired": expired,
|
||||
"ttl_days": self.ttl_days
|
||||
"ttl_days": self.ttl_days,
|
||||
}
|
||||
|
||||
def close(self):
|
||||
|
||||
@@ -9,6 +9,7 @@ import numpy as np
|
||||
# OpenAI support
|
||||
try:
|
||||
from openai import OpenAI
|
||||
|
||||
OPENAI_AVAILABLE = True
|
||||
except ImportError:
|
||||
OPENAI_AVAILABLE = False
|
||||
@@ -16,6 +17,7 @@ except ImportError:
|
||||
# Sentence transformers support
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
SENTENCE_TRANSFORMERS_AVAILABLE = True
|
||||
except ImportError:
|
||||
SENTENCE_TRANSFORMERS_AVAILABLE = False
|
||||
@@ -23,6 +25,7 @@ except ImportError:
|
||||
# Voyage AI support (recommended by Anthropic for embeddings)
|
||||
try:
|
||||
import voyageai
|
||||
|
||||
VOYAGE_AVAILABLE = True
|
||||
except ImportError:
|
||||
VOYAGE_AVAILABLE = False
|
||||
@@ -129,7 +132,7 @@ class EmbeddingGenerator:
|
||||
self,
|
||||
api_key: str | None = None,
|
||||
voyage_api_key: str | None = None,
|
||||
cache_dir: str | None = None
|
||||
cache_dir: str | None = None,
|
||||
):
|
||||
"""
|
||||
Initialize embedding generator.
|
||||
@@ -162,8 +165,7 @@ class EmbeddingGenerator:
|
||||
"""Get information about a model."""
|
||||
if model not in self.MODELS:
|
||||
raise ValueError(
|
||||
f"Unknown model: {model}. "
|
||||
f"Available models: {', '.join(self.MODELS.keys())}"
|
||||
f"Unknown model: {model}. Available models: {', '.join(self.MODELS.keys())}"
|
||||
)
|
||||
return self.MODELS[model]
|
||||
|
||||
@@ -171,20 +173,19 @@ class EmbeddingGenerator:
|
||||
"""List all available models."""
|
||||
models = []
|
||||
for name, info in self.MODELS.items():
|
||||
models.append({
|
||||
"name": name,
|
||||
"provider": info["provider"],
|
||||
"dimensions": info["dimensions"],
|
||||
"max_tokens": info["max_tokens"],
|
||||
"cost_per_million": info.get("cost_per_million", 0.0),
|
||||
})
|
||||
models.append(
|
||||
{
|
||||
"name": name,
|
||||
"provider": info["provider"],
|
||||
"dimensions": info["dimensions"],
|
||||
"max_tokens": info["max_tokens"],
|
||||
"cost_per_million": info.get("cost_per_million", 0.0),
|
||||
}
|
||||
)
|
||||
return models
|
||||
|
||||
def generate(
|
||||
self,
|
||||
text: str,
|
||||
model: str = "text-embedding-3-small",
|
||||
normalize: bool = True
|
||||
self, text: str, model: str = "text-embedding-3-small", normalize: bool = True
|
||||
) -> list[float]:
|
||||
"""
|
||||
Generate embedding for a single text.
|
||||
@@ -218,7 +219,7 @@ class EmbeddingGenerator:
|
||||
texts: list[str],
|
||||
model: str = "text-embedding-3-small",
|
||||
normalize: bool = True,
|
||||
batch_size: int = 32
|
||||
batch_size: int = 32,
|
||||
) -> tuple[list[list[float]], int]:
|
||||
"""
|
||||
Generate embeddings for multiple texts.
|
||||
@@ -248,24 +249,18 @@ class EmbeddingGenerator:
|
||||
else:
|
||||
raise ValueError(f"Unsupported provider: {provider}")
|
||||
|
||||
def _generate_openai(
|
||||
self, text: str, model: str, normalize: bool
|
||||
) -> list[float]:
|
||||
def _generate_openai(self, text: str, model: str, normalize: bool) -> list[float]:
|
||||
"""Generate embedding using OpenAI API."""
|
||||
if not OPENAI_AVAILABLE:
|
||||
raise ImportError(
|
||||
"OpenAI is required for OpenAI embeddings. "
|
||||
"Install with: pip install openai"
|
||||
"OpenAI is required for OpenAI embeddings. Install with: pip install openai"
|
||||
)
|
||||
|
||||
if not self.openai_client:
|
||||
raise ValueError("OpenAI API key not provided")
|
||||
|
||||
try:
|
||||
response = self.openai_client.embeddings.create(
|
||||
input=text,
|
||||
model=model
|
||||
)
|
||||
response = self.openai_client.embeddings.create(input=text, model=model)
|
||||
embedding = response.data[0].embedding
|
||||
|
||||
if normalize:
|
||||
@@ -281,8 +276,7 @@ class EmbeddingGenerator:
|
||||
"""Generate embeddings using OpenAI API in batches."""
|
||||
if not OPENAI_AVAILABLE:
|
||||
raise ImportError(
|
||||
"OpenAI is required for OpenAI embeddings. "
|
||||
"Install with: pip install openai"
|
||||
"OpenAI is required for OpenAI embeddings. Install with: pip install openai"
|
||||
)
|
||||
|
||||
if not self.openai_client:
|
||||
@@ -292,13 +286,10 @@ class EmbeddingGenerator:
|
||||
|
||||
# Process in batches
|
||||
for i in range(0, len(texts), batch_size):
|
||||
batch = texts[i:i + batch_size]
|
||||
batch = texts[i : i + batch_size]
|
||||
|
||||
try:
|
||||
response = self.openai_client.embeddings.create(
|
||||
input=batch,
|
||||
model=model
|
||||
)
|
||||
response = self.openai_client.embeddings.create(input=batch, model=model)
|
||||
|
||||
batch_embeddings = [item.embedding for item in response.data]
|
||||
|
||||
@@ -313,24 +304,18 @@ class EmbeddingGenerator:
|
||||
dimensions = len(all_embeddings[0]) if all_embeddings else 0
|
||||
return all_embeddings, dimensions
|
||||
|
||||
def _generate_voyage(
|
||||
self, text: str, model: str, normalize: bool
|
||||
) -> list[float]:
|
||||
def _generate_voyage(self, text: str, model: str, normalize: bool) -> list[float]:
|
||||
"""Generate embedding using Voyage AI API."""
|
||||
if not VOYAGE_AVAILABLE:
|
||||
raise ImportError(
|
||||
"voyageai is required for Voyage AI embeddings. "
|
||||
"Install with: pip install voyageai"
|
||||
"voyageai is required for Voyage AI embeddings. Install with: pip install voyageai"
|
||||
)
|
||||
|
||||
if not self.voyage_client:
|
||||
raise ValueError("Voyage API key not provided")
|
||||
|
||||
try:
|
||||
result = self.voyage_client.embed(
|
||||
texts=[text],
|
||||
model=model
|
||||
)
|
||||
result = self.voyage_client.embed(texts=[text], model=model)
|
||||
embedding = result.embeddings[0]
|
||||
|
||||
if normalize:
|
||||
@@ -346,8 +331,7 @@ class EmbeddingGenerator:
|
||||
"""Generate embeddings using Voyage AI API in batches."""
|
||||
if not VOYAGE_AVAILABLE:
|
||||
raise ImportError(
|
||||
"voyageai is required for Voyage AI embeddings. "
|
||||
"Install with: pip install voyageai"
|
||||
"voyageai is required for Voyage AI embeddings. Install with: pip install voyageai"
|
||||
)
|
||||
|
||||
if not self.voyage_client:
|
||||
@@ -357,13 +341,10 @@ class EmbeddingGenerator:
|
||||
|
||||
# Process in batches (Voyage AI supports up to 128 texts per request)
|
||||
for i in range(0, len(texts), batch_size):
|
||||
batch = texts[i:i + batch_size]
|
||||
batch = texts[i : i + batch_size]
|
||||
|
||||
try:
|
||||
result = self.voyage_client.embed(
|
||||
texts=batch,
|
||||
model=model
|
||||
)
|
||||
result = self.voyage_client.embed(texts=batch, model=model)
|
||||
|
||||
batch_embeddings = result.embeddings
|
||||
|
||||
@@ -378,9 +359,7 @@ class EmbeddingGenerator:
|
||||
dimensions = len(all_embeddings[0]) if all_embeddings else 0
|
||||
return all_embeddings, dimensions
|
||||
|
||||
def _generate_sentence_transformer(
|
||||
self, text: str, model: str, normalize: bool
|
||||
) -> list[float]:
|
||||
def _generate_sentence_transformer(self, text: str, model: str, normalize: bool) -> list[float]:
|
||||
"""Generate embedding using sentence-transformers."""
|
||||
if not SENTENCE_TRANSFORMERS_AVAILABLE:
|
||||
raise ImportError(
|
||||
@@ -417,10 +396,7 @@ class EmbeddingGenerator:
|
||||
|
||||
# Generate embeddings in batches
|
||||
embeddings = st_model.encode(
|
||||
texts,
|
||||
batch_size=batch_size,
|
||||
normalize_embeddings=normalize,
|
||||
show_progress_bar=False
|
||||
texts, batch_size=batch_size, normalize_embeddings=normalize, show_progress_bar=False
|
||||
)
|
||||
|
||||
dimensions = len(embeddings[0]) if len(embeddings) > 0 else 0
|
||||
|
||||
@@ -14,20 +14,14 @@ class EmbeddingRequest(BaseModel):
|
||||
"example": {
|
||||
"text": "This is a test document about Python programming.",
|
||||
"model": "text-embedding-3-small",
|
||||
"normalize": True
|
||||
"normalize": True,
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
text: str = Field(..., description="Text to generate embedding for")
|
||||
model: str = Field(
|
||||
default="text-embedding-3-small",
|
||||
description="Embedding model to use"
|
||||
)
|
||||
normalize: bool = Field(
|
||||
default=True,
|
||||
description="Normalize embeddings to unit length"
|
||||
)
|
||||
model: str = Field(default="text-embedding-3-small", description="Embedding model to use")
|
||||
normalize: bool = Field(default=True, description="Normalize embeddings to unit length")
|
||||
|
||||
|
||||
class BatchEmbeddingRequest(BaseModel):
|
||||
@@ -39,27 +33,20 @@ class BatchEmbeddingRequest(BaseModel):
|
||||
"texts": [
|
||||
"First document about Python",
|
||||
"Second document about JavaScript",
|
||||
"Third document about Rust"
|
||||
"Third document about Rust",
|
||||
],
|
||||
"model": "text-embedding-3-small",
|
||||
"normalize": True,
|
||||
"batch_size": 32
|
||||
"batch_size": 32,
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
texts: list[str] = Field(..., description="List of texts to embed")
|
||||
model: str = Field(
|
||||
default="text-embedding-3-small",
|
||||
description="Embedding model to use"
|
||||
)
|
||||
normalize: bool = Field(
|
||||
default=True,
|
||||
description="Normalize embeddings to unit length"
|
||||
)
|
||||
model: str = Field(default="text-embedding-3-small", description="Embedding model to use")
|
||||
normalize: bool = Field(default=True, description="Normalize embeddings to unit length")
|
||||
batch_size: int | None = Field(
|
||||
default=32,
|
||||
description="Batch size for processing (default: 32)"
|
||||
default=32, description="Batch size for processing (default: 32)"
|
||||
)
|
||||
|
||||
|
||||
@@ -69,10 +56,7 @@ class EmbeddingResponse(BaseModel):
|
||||
embedding: list[float] = Field(..., description="Generated embedding vector")
|
||||
model: str = Field(..., description="Model used for generation")
|
||||
dimensions: int = Field(..., description="Embedding dimensions")
|
||||
cached: bool = Field(
|
||||
default=False,
|
||||
description="Whether embedding was retrieved from cache"
|
||||
)
|
||||
cached: bool = Field(default=False, description="Whether embedding was retrieved from cache")
|
||||
|
||||
|
||||
class BatchEmbeddingResponse(BaseModel):
|
||||
@@ -82,10 +66,7 @@ class BatchEmbeddingResponse(BaseModel):
|
||||
model: str = Field(..., description="Model used for generation")
|
||||
dimensions: int = Field(..., description="Embedding dimensions")
|
||||
count: int = Field(..., description="Number of embeddings generated")
|
||||
cached_count: int = Field(
|
||||
default=0,
|
||||
description="Number of embeddings retrieved from cache"
|
||||
)
|
||||
cached_count: int = Field(default=0, description="Number of embeddings retrieved from cache")
|
||||
|
||||
|
||||
class SkillEmbeddingRequest(BaseModel):
|
||||
@@ -97,24 +78,15 @@ class SkillEmbeddingRequest(BaseModel):
|
||||
"skill_path": "/path/to/skill/react",
|
||||
"model": "text-embedding-3-small",
|
||||
"chunk_size": 512,
|
||||
"overlap": 50
|
||||
"overlap": 50,
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
skill_path: str = Field(..., description="Path to skill directory")
|
||||
model: str = Field(
|
||||
default="text-embedding-3-small",
|
||||
description="Embedding model to use"
|
||||
)
|
||||
chunk_size: int = Field(
|
||||
default=512,
|
||||
description="Chunk size for splitting documents (tokens)"
|
||||
)
|
||||
overlap: int = Field(
|
||||
default=50,
|
||||
description="Overlap between chunks (tokens)"
|
||||
)
|
||||
model: str = Field(default="text-embedding-3-small", description="Embedding model to use")
|
||||
chunk_size: int = Field(default=512, description="Chunk size for splitting documents (tokens)")
|
||||
overlap: int = Field(default=50, description="Overlap between chunks (tokens)")
|
||||
|
||||
|
||||
class SkillEmbeddingResponse(BaseModel):
|
||||
@@ -124,10 +96,7 @@ class SkillEmbeddingResponse(BaseModel):
|
||||
total_chunks: int = Field(..., description="Total number of chunks embedded")
|
||||
model: str = Field(..., description="Model used for generation")
|
||||
dimensions: int = Field(..., description="Embedding dimensions")
|
||||
metadata: dict[str, Any] = Field(
|
||||
default_factory=dict,
|
||||
description="Skill metadata"
|
||||
)
|
||||
metadata: dict[str, Any] = Field(default_factory=dict, description="Skill metadata")
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
|
||||
@@ -144,12 +113,13 @@ class ModelInfo(BaseModel):
|
||||
"""Information about an embedding model."""
|
||||
|
||||
name: str = Field(..., description="Model name")
|
||||
provider: str = Field(..., description="Model provider (openai, anthropic, sentence-transformers)")
|
||||
provider: str = Field(
|
||||
..., description="Model provider (openai, anthropic, sentence-transformers)"
|
||||
)
|
||||
dimensions: int = Field(..., description="Embedding dimensions")
|
||||
max_tokens: int = Field(..., description="Maximum input tokens")
|
||||
cost_per_million: float | None = Field(
|
||||
None,
|
||||
description="Cost per million tokens (if applicable)"
|
||||
None, description="Cost per million tokens (if applicable)"
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -25,6 +25,7 @@ try:
|
||||
from fastapi import FastAPI, HTTPException, Query
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
import uvicorn
|
||||
|
||||
FASTAPI_AVAILABLE = True
|
||||
except ImportError:
|
||||
FASTAPI_AVAILABLE = False
|
||||
@@ -51,7 +52,7 @@ if FASTAPI_AVAILABLE:
|
||||
description="Generate embeddings for text and skill content",
|
||||
version="1.0.0",
|
||||
docs_url="/docs",
|
||||
redoc_url="/redoc"
|
||||
redoc_url="/redoc",
|
||||
)
|
||||
|
||||
# Add CORS middleware
|
||||
@@ -64,13 +65,14 @@ if FASTAPI_AVAILABLE:
|
||||
)
|
||||
|
||||
# Initialize generator and cache
|
||||
cache_dir = os.getenv("EMBEDDING_CACHE_DIR", os.path.expanduser("~/.cache/skill-seekers/embeddings"))
|
||||
cache_dir = os.getenv(
|
||||
"EMBEDDING_CACHE_DIR", os.path.expanduser("~/.cache/skill-seekers/embeddings")
|
||||
)
|
||||
cache_db = os.path.join(cache_dir, "embeddings.db")
|
||||
cache_enabled = os.getenv("EMBEDDING_CACHE_ENABLED", "true").lower() == "true"
|
||||
|
||||
generator = EmbeddingGenerator(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
voyage_api_key=os.getenv("VOYAGE_API_KEY")
|
||||
api_key=os.getenv("OPENAI_API_KEY"), voyage_api_key=os.getenv("VOYAGE_API_KEY")
|
||||
)
|
||||
cache = EmbeddingCache(cache_db) if cache_enabled else None
|
||||
|
||||
@@ -81,7 +83,7 @@ if FASTAPI_AVAILABLE:
|
||||
"service": "Skill Seekers Embedding API",
|
||||
"version": "1.0.0",
|
||||
"docs": "/docs",
|
||||
"health": "/health"
|
||||
"health": "/health",
|
||||
}
|
||||
|
||||
@app.get("/health", response_model=HealthResponse)
|
||||
@@ -95,7 +97,7 @@ if FASTAPI_AVAILABLE:
|
||||
version="1.0.0",
|
||||
models=models,
|
||||
cache_enabled=cache_enabled,
|
||||
cache_size=cache_size
|
||||
cache_size=cache_size,
|
||||
)
|
||||
|
||||
@app.get("/models", response_model=ModelsResponse)
|
||||
@@ -109,15 +111,12 @@ if FASTAPI_AVAILABLE:
|
||||
provider=m["provider"],
|
||||
dimensions=m["dimensions"],
|
||||
max_tokens=m["max_tokens"],
|
||||
cost_per_million=m.get("cost_per_million")
|
||||
cost_per_million=m.get("cost_per_million"),
|
||||
)
|
||||
for m in models_list
|
||||
]
|
||||
|
||||
return ModelsResponse(
|
||||
models=model_infos,
|
||||
count=len(model_infos)
|
||||
)
|
||||
return ModelsResponse(models=model_infos, count=len(model_infos))
|
||||
|
||||
@app.post("/embed", response_model=EmbeddingResponse)
|
||||
async def embed_text(request: EmbeddingRequest):
|
||||
@@ -144,9 +143,7 @@ if FASTAPI_AVAILABLE:
|
||||
else:
|
||||
# Generate embedding
|
||||
embedding = generator.generate(
|
||||
request.text,
|
||||
model=request.model,
|
||||
normalize=request.normalize
|
||||
request.text, model=request.model, normalize=request.normalize
|
||||
)
|
||||
|
||||
# Store in cache
|
||||
@@ -154,10 +151,7 @@ if FASTAPI_AVAILABLE:
|
||||
cache.set(hash_key, embedding, request.model)
|
||||
|
||||
return EmbeddingResponse(
|
||||
embedding=embedding,
|
||||
model=request.model,
|
||||
dimensions=len(embedding),
|
||||
cached=cached
|
||||
embedding=embedding, model=request.model, dimensions=len(embedding), cached=cached
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@@ -202,11 +196,13 @@ if FASTAPI_AVAILABLE:
|
||||
texts_to_generate,
|
||||
model=request.model,
|
||||
normalize=request.normalize,
|
||||
batch_size=request.batch_size
|
||||
batch_size=request.batch_size,
|
||||
)
|
||||
|
||||
# Fill in placeholders and cache
|
||||
for idx, text, embedding in zip(text_indices, texts_to_generate, generated_embeddings, strict=False):
|
||||
for idx, text, embedding in zip(
|
||||
text_indices, texts_to_generate, generated_embeddings, strict=False
|
||||
):
|
||||
embeddings[idx] = embedding
|
||||
|
||||
if cache:
|
||||
@@ -220,7 +216,7 @@ if FASTAPI_AVAILABLE:
|
||||
model=request.model,
|
||||
dimensions=dimensions,
|
||||
count=len(embeddings),
|
||||
cached_count=cached_count
|
||||
cached_count=cached_count,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@@ -244,12 +240,16 @@ if FASTAPI_AVAILABLE:
|
||||
skill_path = Path(request.skill_path)
|
||||
|
||||
if not skill_path.exists():
|
||||
raise HTTPException(status_code=404, detail=f"Skill path not found: {request.skill_path}")
|
||||
raise HTTPException(
|
||||
status_code=404, detail=f"Skill path not found: {request.skill_path}"
|
||||
)
|
||||
|
||||
# Read SKILL.md
|
||||
skill_md = skill_path / "SKILL.md"
|
||||
if not skill_md.exists():
|
||||
raise HTTPException(status_code=404, detail=f"SKILL.md not found in {request.skill_path}")
|
||||
raise HTTPException(
|
||||
status_code=404, detail=f"SKILL.md not found in {request.skill_path}"
|
||||
)
|
||||
|
||||
skill_content = skill_md.read_text()
|
||||
|
||||
@@ -262,10 +262,7 @@ if FASTAPI_AVAILABLE:
|
||||
|
||||
# Generate embeddings for chunks
|
||||
embeddings, dimensions = generator.generate_batch(
|
||||
chunks,
|
||||
model=request.model,
|
||||
normalize=True,
|
||||
batch_size=32
|
||||
chunks, model=request.model, normalize=True, batch_size=32
|
||||
)
|
||||
|
||||
# TODO: Store embeddings in vector database
|
||||
@@ -279,8 +276,8 @@ if FASTAPI_AVAILABLE:
|
||||
metadata={
|
||||
"skill_path": str(skill_path),
|
||||
"chunks": len(chunks),
|
||||
"content_length": len(skill_content)
|
||||
}
|
||||
"content_length": len(skill_content),
|
||||
},
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
@@ -298,7 +295,7 @@ if FASTAPI_AVAILABLE:
|
||||
|
||||
@app.post("/cache/clear", response_model=dict)
|
||||
async def clear_cache(
|
||||
model: str | None = Query(None, description="Model to clear (all if not specified)")
|
||||
model: str | None = Query(None, description="Model to clear (all if not specified)"),
|
||||
):
|
||||
"""Clear cache entries."""
|
||||
if not cache:
|
||||
@@ -306,11 +303,7 @@ if FASTAPI_AVAILABLE:
|
||||
|
||||
deleted = cache.clear(model=model)
|
||||
|
||||
return {
|
||||
"status": "ok",
|
||||
"deleted": deleted,
|
||||
"model": model or "all"
|
||||
}
|
||||
return {"status": "ok", "deleted": deleted, "model": model or "all"}
|
||||
|
||||
@app.post("/cache/clear-expired", response_model=dict)
|
||||
async def clear_expired():
|
||||
@@ -320,10 +313,7 @@ if FASTAPI_AVAILABLE:
|
||||
|
||||
deleted = cache.clear_expired()
|
||||
|
||||
return {
|
||||
"status": "ok",
|
||||
"deleted": deleted
|
||||
}
|
||||
return {"status": "ok", "deleted": deleted}
|
||||
|
||||
else:
|
||||
print("Error: FastAPI not available. Install with: pip install fastapi uvicorn")
|
||||
@@ -348,12 +338,7 @@ def main():
|
||||
if cache_enabled:
|
||||
print(f"💾 Cache database: {cache_db}")
|
||||
|
||||
uvicorn.run(
|
||||
"skill_seekers.embedding.server:app",
|
||||
host=host,
|
||||
port=port,
|
||||
reload=reload
|
||||
)
|
||||
uvicorn.run("skill_seekers.embedding.server:app", host=host, port=port, reload=reload)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -69,15 +69,17 @@ async def generate_config(args: dict) -> list[TextContent]:
|
||||
config = {
|
||||
"name": name,
|
||||
"description": description,
|
||||
"sources": [{
|
||||
"type": "documentation",
|
||||
"base_url": url,
|
||||
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
|
||||
"url_patterns": {"include": [], "exclude": []},
|
||||
"categories": {},
|
||||
"rate_limit": rate_limit,
|
||||
"max_pages": max_pages,
|
||||
}],
|
||||
"sources": [
|
||||
{
|
||||
"type": "documentation",
|
||||
"base_url": url,
|
||||
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
|
||||
"url_patterns": {"include": [], "exclude": []},
|
||||
"categories": {},
|
||||
"rate_limit": rate_limit,
|
||||
"max_pages": max_pages,
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
# Save to configs directory
|
||||
|
||||
@@ -32,9 +32,9 @@ from .detector import ChangeDetector
|
||||
from .models import SyncConfig, ChangeReport, PageChange
|
||||
|
||||
__all__ = [
|
||||
'SyncMonitor',
|
||||
'ChangeDetector',
|
||||
'SyncConfig',
|
||||
'ChangeReport',
|
||||
'PageChange',
|
||||
"SyncMonitor",
|
||||
"ChangeDetector",
|
||||
"SyncConfig",
|
||||
"ChangeReport",
|
||||
"PageChange",
|
||||
]
|
||||
|
||||
@@ -55,7 +55,7 @@ class ChangeDetector:
|
||||
Returns:
|
||||
Hexadecimal hash string
|
||||
"""
|
||||
return hashlib.sha256(content.encode('utf-8')).hexdigest()
|
||||
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
||||
|
||||
def fetch_page(self, url: str) -> tuple[str, dict[str, str]]:
|
||||
"""
|
||||
@@ -72,17 +72,15 @@ class ChangeDetector:
|
||||
requests.RequestException: If fetch fails
|
||||
"""
|
||||
response = requests.get(
|
||||
url,
|
||||
timeout=self.timeout,
|
||||
headers={'User-Agent': 'SkillSeekers-Sync/1.0'}
|
||||
url, timeout=self.timeout, headers={"User-Agent": "SkillSeekers-Sync/1.0"}
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
metadata = {
|
||||
'last-modified': response.headers.get('Last-Modified'),
|
||||
'etag': response.headers.get('ETag'),
|
||||
'content-type': response.headers.get('Content-Type'),
|
||||
'content-length': response.headers.get('Content-Length'),
|
||||
"last-modified": response.headers.get("Last-Modified"),
|
||||
"etag": response.headers.get("ETag"),
|
||||
"content-type": response.headers.get("Content-Type"),
|
||||
"content-length": response.headers.get("Content-Length"),
|
||||
}
|
||||
|
||||
return response.text, metadata
|
||||
@@ -92,7 +90,7 @@ class ChangeDetector:
|
||||
url: str,
|
||||
old_hash: str | None = None,
|
||||
generate_diff: bool = False,
|
||||
old_content: str | None = None
|
||||
old_content: str | None = None,
|
||||
) -> PageChange:
|
||||
"""
|
||||
Check if page has changed.
|
||||
@@ -132,7 +130,7 @@ class ChangeDetector:
|
||||
old_hash=old_hash,
|
||||
new_hash=new_hash,
|
||||
diff=diff,
|
||||
detected_at=datetime.utcnow()
|
||||
detected_at=datetime.utcnow(),
|
||||
)
|
||||
|
||||
except requests.RequestException:
|
||||
@@ -142,14 +140,11 @@ class ChangeDetector:
|
||||
change_type=ChangeType.DELETED,
|
||||
old_hash=old_hash,
|
||||
new_hash=None,
|
||||
detected_at=datetime.utcnow()
|
||||
detected_at=datetime.utcnow(),
|
||||
)
|
||||
|
||||
def check_pages(
|
||||
self,
|
||||
urls: list[str],
|
||||
previous_hashes: dict[str, str],
|
||||
generate_diffs: bool = False
|
||||
self, urls: list[str], previous_hashes: dict[str, str], generate_diffs: bool = False
|
||||
) -> ChangeReport:
|
||||
"""
|
||||
Check multiple pages for changes.
|
||||
@@ -185,13 +180,15 @@ class ChangeDetector:
|
||||
# Check for deleted pages (in previous state but not in current)
|
||||
for url, old_hash in previous_hashes.items():
|
||||
if url not in checked_urls:
|
||||
deleted.append(PageChange(
|
||||
url=url,
|
||||
change_type=ChangeType.DELETED,
|
||||
old_hash=old_hash,
|
||||
new_hash=None,
|
||||
detected_at=datetime.utcnow()
|
||||
))
|
||||
deleted.append(
|
||||
PageChange(
|
||||
url=url,
|
||||
change_type=ChangeType.DELETED,
|
||||
old_hash=old_hash,
|
||||
new_hash=None,
|
||||
detected_at=datetime.utcnow(),
|
||||
)
|
||||
)
|
||||
|
||||
return ChangeReport(
|
||||
skill_name="unknown", # To be set by caller
|
||||
@@ -200,7 +197,7 @@ class ChangeDetector:
|
||||
modified=modified,
|
||||
deleted=deleted,
|
||||
unchanged=unchanged_count,
|
||||
checked_at=datetime.utcnow()
|
||||
checked_at=datetime.utcnow(),
|
||||
)
|
||||
|
||||
def generate_diff(self, old_content: str, new_content: str) -> str:
|
||||
@@ -217,15 +214,9 @@ class ChangeDetector:
|
||||
old_lines = old_content.splitlines(keepends=True)
|
||||
new_lines = new_content.splitlines(keepends=True)
|
||||
|
||||
diff = difflib.unified_diff(
|
||||
old_lines,
|
||||
new_lines,
|
||||
fromfile='old',
|
||||
tofile='new',
|
||||
lineterm=''
|
||||
)
|
||||
diff = difflib.unified_diff(old_lines, new_lines, fromfile="old", tofile="new", lineterm="")
|
||||
|
||||
return ''.join(diff)
|
||||
return "".join(diff)
|
||||
|
||||
def generate_summary_diff(self, old_content: str, new_content: str) -> str:
|
||||
"""
|
||||
@@ -244,16 +235,15 @@ class ChangeDetector:
|
||||
diff = difflib.unified_diff(old_lines, new_lines)
|
||||
diff_lines = list(diff)
|
||||
|
||||
added = sum(1 for line in diff_lines if line.startswith('+') and not line.startswith('+++'))
|
||||
removed = sum(1 for line in diff_lines if line.startswith('-') and not line.startswith('---'))
|
||||
added = sum(1 for line in diff_lines if line.startswith("+") and not line.startswith("+++"))
|
||||
removed = sum(
|
||||
1 for line in diff_lines if line.startswith("-") and not line.startswith("---")
|
||||
)
|
||||
|
||||
return f"+{added} -{removed} lines"
|
||||
|
||||
def check_header_changes(
|
||||
self,
|
||||
url: str,
|
||||
old_modified: str | None = None,
|
||||
old_etag: str | None = None
|
||||
self, url: str, old_modified: str | None = None, old_etag: str | None = None
|
||||
) -> bool:
|
||||
"""
|
||||
Quick check using HTTP headers (no content download).
|
||||
@@ -269,14 +259,12 @@ class ChangeDetector:
|
||||
try:
|
||||
# Use HEAD request for efficiency
|
||||
response = requests.head(
|
||||
url,
|
||||
timeout=self.timeout,
|
||||
headers={'User-Agent': 'SkillSeekers-Sync/1.0'}
|
||||
url, timeout=self.timeout, headers={"User-Agent": "SkillSeekers-Sync/1.0"}
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
new_modified = response.headers.get('Last-Modified')
|
||||
new_etag = response.headers.get('ETag')
|
||||
new_modified = response.headers.get("Last-Modified")
|
||||
new_etag = response.headers.get("ETag")
|
||||
|
||||
# Check if headers indicate change
|
||||
if old_modified and new_modified and old_modified != new_modified:
|
||||
@@ -289,9 +277,7 @@ class ChangeDetector:
|
||||
return True
|
||||
|
||||
def batch_check_headers(
|
||||
self,
|
||||
urls: list[str],
|
||||
previous_metadata: dict[str, dict[str, str]]
|
||||
self, urls: list[str], previous_metadata: dict[str, dict[str, str]]
|
||||
) -> list[str]:
|
||||
"""
|
||||
Batch check URLs using headers only.
|
||||
@@ -307,8 +293,8 @@ class ChangeDetector:
|
||||
|
||||
for url in urls:
|
||||
old_meta = previous_metadata.get(url, {})
|
||||
old_modified = old_meta.get('last-modified')
|
||||
old_etag = old_meta.get('etag')
|
||||
old_modified = old_meta.get("last-modified")
|
||||
old_etag = old_meta.get("etag")
|
||||
|
||||
if self.check_header_changes(url, old_modified, old_etag):
|
||||
changed_urls.append(url)
|
||||
|
||||
@@ -10,6 +10,7 @@ from pydantic import BaseModel, Field
|
||||
|
||||
class ChangeType(str, Enum):
|
||||
"""Type of change detected."""
|
||||
|
||||
ADDED = "added"
|
||||
MODIFIED = "modified"
|
||||
DELETED = "deleted"
|
||||
@@ -25,8 +26,7 @@ class PageChange(BaseModel):
|
||||
new_hash: str | None = Field(None, description="New content hash")
|
||||
diff: str | None = Field(None, description="Content diff (if available)")
|
||||
detected_at: datetime = Field(
|
||||
default_factory=datetime.utcnow,
|
||||
description="When change was detected"
|
||||
default_factory=datetime.utcnow, description="When change was detected"
|
||||
)
|
||||
|
||||
class Config:
|
||||
@@ -37,7 +37,7 @@ class PageChange(BaseModel):
|
||||
"old_hash": "abc123",
|
||||
"new_hash": "def456",
|
||||
"diff": "@@ -10,3 +10,4 @@\n+New content here",
|
||||
"detected_at": "2024-01-15T10:30:00Z"
|
||||
"detected_at": "2024-01-15T10:30:00Z",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -52,8 +52,7 @@ class ChangeReport(BaseModel):
|
||||
deleted: list[PageChange] = Field(default_factory=list, description="Deleted pages")
|
||||
unchanged: int = Field(0, description="Number of unchanged pages")
|
||||
checked_at: datetime = Field(
|
||||
default_factory=datetime.utcnow,
|
||||
description="When check was performed"
|
||||
default_factory=datetime.utcnow, description="When check was performed"
|
||||
)
|
||||
|
||||
@property
|
||||
@@ -72,34 +71,19 @@ class SyncConfig(BaseModel):
|
||||
|
||||
skill_config: str = Field(..., description="Path to skill config file")
|
||||
check_interval: int = Field(
|
||||
default=3600,
|
||||
description="Check interval in seconds (default: 1 hour)"
|
||||
default=3600, description="Check interval in seconds (default: 1 hour)"
|
||||
)
|
||||
enabled: bool = Field(default=True, description="Whether sync is enabled")
|
||||
auto_update: bool = Field(
|
||||
default=False,
|
||||
description="Automatically rebuild skill on changes"
|
||||
)
|
||||
notify_on_change: bool = Field(
|
||||
default=True,
|
||||
description="Send notifications on changes"
|
||||
)
|
||||
auto_update: bool = Field(default=False, description="Automatically rebuild skill on changes")
|
||||
notify_on_change: bool = Field(default=True, description="Send notifications on changes")
|
||||
notification_channels: list[str] = Field(
|
||||
default_factory=list,
|
||||
description="Notification channels (email, slack, webhook)"
|
||||
)
|
||||
webhook_url: str | None = Field(
|
||||
None,
|
||||
description="Webhook URL for change notifications"
|
||||
default_factory=list, description="Notification channels (email, slack, webhook)"
|
||||
)
|
||||
webhook_url: str | None = Field(None, description="Webhook URL for change notifications")
|
||||
email_recipients: list[str] = Field(
|
||||
default_factory=list,
|
||||
description="Email recipients for notifications"
|
||||
)
|
||||
slack_webhook: str | None = Field(
|
||||
None,
|
||||
description="Slack webhook URL"
|
||||
default_factory=list, description="Email recipients for notifications"
|
||||
)
|
||||
slack_webhook: str | None = Field(None, description="Slack webhook URL")
|
||||
|
||||
class Config:
|
||||
json_schema_extra = {
|
||||
@@ -111,7 +95,7 @@ class SyncConfig(BaseModel):
|
||||
"notify_on_change": True,
|
||||
"notification_channels": ["slack", "webhook"],
|
||||
"webhook_url": "https://example.com/webhook",
|
||||
"slack_webhook": "https://hooks.slack.com/services/..."
|
||||
"slack_webhook": "https://hooks.slack.com/services/...",
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,8 +109,7 @@ class SyncState(BaseModel):
|
||||
total_checks: int = Field(default=0, description="Total checks performed")
|
||||
total_changes: int = Field(default=0, description="Total changes detected")
|
||||
page_hashes: dict[str, str] = Field(
|
||||
default_factory=dict,
|
||||
description="URL -> content hash mapping"
|
||||
default_factory=dict, description="URL -> content hash mapping"
|
||||
)
|
||||
status: str = Field(default="idle", description="Current status")
|
||||
error: str | None = Field(None, description="Last error message")
|
||||
@@ -137,15 +120,9 @@ class WebhookPayload(BaseModel):
|
||||
|
||||
event: str = Field(..., description="Event type (change_detected, sync_complete)")
|
||||
skill_name: str = Field(..., description="Skill name")
|
||||
timestamp: datetime = Field(
|
||||
default_factory=datetime.utcnow,
|
||||
description="Event timestamp"
|
||||
)
|
||||
timestamp: datetime = Field(default_factory=datetime.utcnow, description="Event timestamp")
|
||||
changes: ChangeReport | None = Field(None, description="Change report")
|
||||
metadata: dict[str, Any] = Field(
|
||||
default_factory=dict,
|
||||
description="Additional metadata"
|
||||
)
|
||||
metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
|
||||
|
||||
class Config:
|
||||
json_schema_extra = {
|
||||
@@ -157,8 +134,8 @@ class WebhookPayload(BaseModel):
|
||||
"total_pages": 150,
|
||||
"added": [],
|
||||
"modified": [{"url": "https://react.dev/learn"}],
|
||||
"deleted": []
|
||||
"deleted": [],
|
||||
},
|
||||
"metadata": {"source": "periodic_check"}
|
||||
"metadata": {"source": "periodic_check"},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,7 +51,7 @@ class SyncMonitor:
|
||||
check_interval: int = 3600,
|
||||
auto_update: bool = False,
|
||||
state_file: str | None = None,
|
||||
on_change: Callable[[ChangeReport], None] | None = None
|
||||
on_change: Callable[[ChangeReport], None] | None = None,
|
||||
):
|
||||
"""
|
||||
Initialize sync monitor.
|
||||
@@ -72,7 +72,7 @@ class SyncMonitor:
|
||||
with open(self.config_path) as f:
|
||||
self.skill_config = json.load(f)
|
||||
|
||||
self.skill_name = self.skill_config.get('name', 'unknown')
|
||||
self.skill_name = self.skill_config.get("name", "unknown")
|
||||
|
||||
# State file
|
||||
if state_file:
|
||||
@@ -97,10 +97,10 @@ class SyncMonitor:
|
||||
with open(self.state_file) as f:
|
||||
data = json.load(f)
|
||||
# Convert datetime strings back
|
||||
if data.get('last_check'):
|
||||
data['last_check'] = datetime.fromisoformat(data['last_check'])
|
||||
if data.get('last_change'):
|
||||
data['last_change'] = datetime.fromisoformat(data['last_change'])
|
||||
if data.get("last_check"):
|
||||
data["last_check"] = datetime.fromisoformat(data["last_check"])
|
||||
if data.get("last_change"):
|
||||
data["last_change"] = datetime.fromisoformat(data["last_change"])
|
||||
return SyncState(**data)
|
||||
else:
|
||||
return SyncState(skill_name=self.skill_name)
|
||||
@@ -109,12 +109,12 @@ class SyncMonitor:
|
||||
"""Save current state to file."""
|
||||
# Convert datetime to ISO format
|
||||
data = self.state.dict()
|
||||
if data.get('last_check'):
|
||||
data['last_check'] = data['last_check'].isoformat()
|
||||
if data.get('last_change'):
|
||||
data['last_change'] = data['last_change'].isoformat()
|
||||
if data.get("last_check"):
|
||||
data["last_check"] = data["last_check"].isoformat()
|
||||
if data.get("last_change"):
|
||||
data["last_change"] = data["last_change"].isoformat()
|
||||
|
||||
with open(self.state_file, 'w') as f:
|
||||
with open(self.state_file, "w") as f:
|
||||
json.dump(data, f, indent=2)
|
||||
|
||||
def check_now(self, generate_diffs: bool = False) -> ChangeReport:
|
||||
@@ -132,7 +132,7 @@ class SyncMonitor:
|
||||
|
||||
try:
|
||||
# Get URLs to check from config
|
||||
base_url = self.skill_config.get('base_url')
|
||||
base_url = self.skill_config.get("base_url")
|
||||
# TODO: In real implementation, get actual URLs from scraper
|
||||
|
||||
# For now, simulate with base URL only
|
||||
@@ -140,9 +140,7 @@ class SyncMonitor:
|
||||
|
||||
# Check for changes
|
||||
report = self.detector.check_pages(
|
||||
urls=urls,
|
||||
previous_hashes=self.state.page_hashes,
|
||||
generate_diffs=generate_diffs
|
||||
urls=urls, previous_hashes=self.state.page_hashes, generate_diffs=generate_diffs
|
||||
)
|
||||
report.skill_name = self.skill_name
|
||||
|
||||
@@ -192,7 +190,7 @@ class SyncMonitor:
|
||||
event="change_detected",
|
||||
skill_name=self.skill_name,
|
||||
changes=report,
|
||||
metadata={"auto_update": self.auto_update}
|
||||
metadata={"auto_update": self.auto_update},
|
||||
)
|
||||
|
||||
self.notifier.send(payload)
|
||||
@@ -214,9 +212,7 @@ class SyncMonitor:
|
||||
self._running = True
|
||||
|
||||
# Schedule checks
|
||||
schedule.every(self.check_interval).seconds.do(
|
||||
lambda: self.check_now()
|
||||
)
|
||||
schedule.every(self.check_interval).seconds.do(lambda: self.check_now())
|
||||
|
||||
# Run in thread
|
||||
def run_schedule():
|
||||
|
||||
@@ -34,7 +34,7 @@ class Notifier:
|
||||
webhook_url: str | None = None,
|
||||
slack_webhook: str | None = None,
|
||||
email_recipients: list[str] | None = None,
|
||||
console: bool = True
|
||||
console: bool = True,
|
||||
):
|
||||
"""
|
||||
Initialize notifier.
|
||||
@@ -45,8 +45,8 @@ class Notifier:
|
||||
email_recipients: List of email recipients
|
||||
console: Whether to print to console
|
||||
"""
|
||||
self.webhook_url = webhook_url or os.getenv('SYNC_WEBHOOK_URL')
|
||||
self.slack_webhook = slack_webhook or os.getenv('SLACK_WEBHOOK_URL')
|
||||
self.webhook_url = webhook_url or os.getenv("SYNC_WEBHOOK_URL")
|
||||
self.slack_webhook = slack_webhook or os.getenv("SLACK_WEBHOOK_URL")
|
||||
self.email_recipients = email_recipients or []
|
||||
self.console = console
|
||||
|
||||
@@ -92,8 +92,8 @@ class Notifier:
|
||||
response = requests.post(
|
||||
self.webhook_url,
|
||||
json=payload.dict(),
|
||||
headers={'Content-Type': 'application/json'},
|
||||
timeout=10
|
||||
headers={"Content-Type": "application/json"},
|
||||
timeout=10,
|
||||
)
|
||||
response.raise_for_status()
|
||||
print(f"✅ Webhook notification sent to {self.webhook_url}")
|
||||
@@ -124,14 +124,10 @@ class Notifier:
|
||||
slack_payload = {
|
||||
"text": text,
|
||||
"username": "Skill Seekers Sync",
|
||||
"icon_emoji": ":books:"
|
||||
"icon_emoji": ":books:",
|
||||
}
|
||||
|
||||
response = requests.post(
|
||||
self.slack_webhook,
|
||||
json=slack_payload,
|
||||
timeout=10
|
||||
)
|
||||
response = requests.post(self.slack_webhook, json=slack_payload, timeout=10)
|
||||
response.raise_for_status()
|
||||
print("✅ Slack notification sent")
|
||||
except Exception as e:
|
||||
|
||||
@@ -85,9 +85,17 @@ class TestAdaptorBenchmarks(unittest.TestCase):
|
||||
|
||||
# Platforms to benchmark
|
||||
platforms = [
|
||||
"claude", "gemini", "openai", "markdown", # IDE integrations
|
||||
"langchain", "llama-index", "haystack", # RAG frameworks
|
||||
"weaviate", "chroma", "faiss", "qdrant" # Vector DBs
|
||||
"claude",
|
||||
"gemini",
|
||||
"openai",
|
||||
"markdown", # IDE integrations
|
||||
"langchain",
|
||||
"llama-index",
|
||||
"haystack", # RAG frameworks
|
||||
"weaviate",
|
||||
"chroma",
|
||||
"faiss",
|
||||
"qdrant", # Vector DBs
|
||||
]
|
||||
|
||||
results = {}
|
||||
@@ -115,20 +123,19 @@ class TestAdaptorBenchmarks(unittest.TestCase):
|
||||
min_time = min(times)
|
||||
max_time = max(times)
|
||||
|
||||
results[platform] = {
|
||||
"avg": avg_time,
|
||||
"min": min_time,
|
||||
"max": max_time
|
||||
}
|
||||
results[platform] = {"avg": avg_time, "min": min_time, "max": max_time}
|
||||
|
||||
print(f"{platform:15} - Avg: {avg_time*1000:6.2f}ms | "
|
||||
f"Min: {min_time*1000:6.2f}ms | Max: {max_time*1000:6.2f}ms")
|
||||
print(
|
||||
f"{platform:15} - Avg: {avg_time * 1000:6.2f}ms | "
|
||||
f"Min: {min_time * 1000:6.2f}ms | Max: {max_time * 1000:6.2f}ms"
|
||||
)
|
||||
|
||||
# Performance assertions (should complete in reasonable time)
|
||||
for platform, metrics in results.items():
|
||||
self.assertLess(
|
||||
metrics["avg"], 0.5, # Should average < 500ms
|
||||
f"{platform} format_skill_md too slow: {metrics['avg']*1000:.2f}ms"
|
||||
metrics["avg"],
|
||||
0.5, # Should average < 500ms
|
||||
f"{platform} format_skill_md too slow: {metrics['avg'] * 1000:.2f}ms",
|
||||
)
|
||||
|
||||
def test_benchmark_package_operations(self):
|
||||
@@ -158,12 +165,9 @@ class TestAdaptorBenchmarks(unittest.TestCase):
|
||||
# Get file size
|
||||
file_size_kb = package_path.stat().st_size / 1024
|
||||
|
||||
results[platform] = {
|
||||
"time": elapsed,
|
||||
"size_kb": file_size_kb
|
||||
}
|
||||
results[platform] = {"time": elapsed, "size_kb": file_size_kb}
|
||||
|
||||
print(f"{platform:15} - Time: {elapsed*1000:7.2f}ms | Size: {file_size_kb:7.1f} KB")
|
||||
print(f"{platform:15} - Time: {elapsed * 1000:7.2f}ms | Size: {file_size_kb:7.1f} KB")
|
||||
|
||||
# Validate output
|
||||
self.assertTrue(package_path.exists())
|
||||
@@ -171,12 +175,14 @@ class TestAdaptorBenchmarks(unittest.TestCase):
|
||||
# Performance assertions
|
||||
for platform, metrics in results.items():
|
||||
self.assertLess(
|
||||
metrics["time"], 1.0, # Should complete < 1 second
|
||||
f"{platform} packaging too slow: {metrics['time']*1000:.2f}ms"
|
||||
metrics["time"],
|
||||
1.0, # Should complete < 1 second
|
||||
f"{platform} packaging too slow: {metrics['time'] * 1000:.2f}ms",
|
||||
)
|
||||
self.assertLess(
|
||||
metrics["size_kb"], 1000, # Should be < 1MB for 10 refs
|
||||
f"{platform} package too large: {metrics['size_kb']:.1f}KB"
|
||||
metrics["size_kb"],
|
||||
1000, # Should be < 1MB for 10 refs
|
||||
f"{platform} package too large: {metrics['size_kb']:.1f}KB",
|
||||
)
|
||||
|
||||
def test_benchmark_scaling_with_reference_count(self):
|
||||
@@ -210,14 +216,18 @@ class TestAdaptorBenchmarks(unittest.TestCase):
|
||||
json.loads(formatted)
|
||||
size_kb = len(formatted) / 1024
|
||||
|
||||
results.append({
|
||||
"count": ref_count,
|
||||
"time": elapsed,
|
||||
"time_per_ref": time_per_ref,
|
||||
"size_kb": size_kb
|
||||
})
|
||||
results.append(
|
||||
{
|
||||
"count": ref_count,
|
||||
"time": elapsed,
|
||||
"time_per_ref": time_per_ref,
|
||||
"size_kb": size_kb,
|
||||
}
|
||||
)
|
||||
|
||||
print(f"{ref_count:4} | {elapsed*1000:10.2f} | {time_per_ref*1000:10.3f} | {size_kb:10.1f}")
|
||||
print(
|
||||
f"{ref_count:4} | {elapsed * 1000:10.2f} | {time_per_ref * 1000:10.3f} | {size_kb:10.1f}"
|
||||
)
|
||||
|
||||
# Analyze scaling behavior
|
||||
# Time per ref should not increase significantly (linear scaling)
|
||||
@@ -230,10 +240,7 @@ class TestAdaptorBenchmarks(unittest.TestCase):
|
||||
print(f"(Time per ref at 50 refs / Time per ref at 1 ref)")
|
||||
|
||||
# Assert linear or sub-linear scaling (not exponential)
|
||||
self.assertLess(
|
||||
scaling_factor, 3.0,
|
||||
f"Non-linear scaling detected: {scaling_factor:.2f}x"
|
||||
)
|
||||
self.assertLess(scaling_factor, 3.0, f"Non-linear scaling detected: {scaling_factor:.2f}x")
|
||||
|
||||
def test_benchmark_json_vs_zip_size_comparison(self):
|
||||
"""Compare output sizes: JSON vs ZIP/tar.gz"""
|
||||
@@ -263,16 +270,15 @@ class TestAdaptorBenchmarks(unittest.TestCase):
|
||||
|
||||
size_kb = package_path.stat().st_size / 1024
|
||||
|
||||
results[platform] = {
|
||||
"format": format_name,
|
||||
"size_kb": size_kb
|
||||
}
|
||||
results[platform] = {"format": format_name, "size_kb": size_kb}
|
||||
|
||||
print(f"{platform:15} | {format_name:8} | {size_kb:10.1f}")
|
||||
|
||||
# Analyze results
|
||||
json_sizes = [v["size_kb"] for k, v in results.items() if v["format"] == "JSON"]
|
||||
compressed_sizes = [v["size_kb"] for k, v in results.items() if v["format"] in ["ZIP", "tar.gz"]]
|
||||
compressed_sizes = [
|
||||
v["size_kb"] for k, v in results.items() if v["format"] in ["ZIP", "tar.gz"]
|
||||
]
|
||||
|
||||
if json_sizes and compressed_sizes:
|
||||
avg_json = sum(json_sizes) / len(json_sizes)
|
||||
@@ -280,7 +286,7 @@ class TestAdaptorBenchmarks(unittest.TestCase):
|
||||
|
||||
print(f"\nAverage JSON size: {avg_json:.1f} KB")
|
||||
print(f"Average compressed size: {avg_compressed:.1f} KB")
|
||||
print(f"Compression ratio: {avg_json/avg_compressed:.2f}x")
|
||||
print(f"Compression ratio: {avg_json / avg_compressed:.2f}x")
|
||||
|
||||
def test_benchmark_metadata_overhead(self):
|
||||
"""Measure metadata processing overhead"""
|
||||
@@ -299,7 +305,7 @@ class TestAdaptorBenchmarks(unittest.TestCase):
|
||||
description="A comprehensive test skill for benchmarking purposes",
|
||||
version="2.5.0",
|
||||
author="Benchmark Suite",
|
||||
tags=["test", "benchmark", "performance", "validation", "quality"]
|
||||
tags=["test", "benchmark", "performance", "validation", "quality"],
|
||||
)
|
||||
|
||||
adaptor = get_adaptor("langchain")
|
||||
@@ -326,15 +332,12 @@ class TestAdaptorBenchmarks(unittest.TestCase):
|
||||
overhead = avg_rich - avg_minimal
|
||||
overhead_pct = (overhead / avg_minimal) * 100
|
||||
|
||||
print(f"\nMinimal metadata: {avg_minimal*1000:.2f}ms")
|
||||
print(f"Rich metadata: {avg_rich*1000:.2f}ms")
|
||||
print(f"Overhead: {overhead*1000:.2f}ms ({overhead_pct:.1f}%)")
|
||||
print(f"\nMinimal metadata: {avg_minimal * 1000:.2f}ms")
|
||||
print(f"Rich metadata: {avg_rich * 1000:.2f}ms")
|
||||
print(f"Overhead: {overhead * 1000:.2f}ms ({overhead_pct:.1f}%)")
|
||||
|
||||
# Overhead should be negligible (< 10%)
|
||||
self.assertLess(
|
||||
overhead_pct, 10.0,
|
||||
f"Metadata overhead too high: {overhead_pct:.1f}%"
|
||||
)
|
||||
self.assertLess(overhead_pct, 10.0, f"Metadata overhead too high: {overhead_pct:.1f}%")
|
||||
|
||||
def test_benchmark_empty_vs_full_skill(self):
|
||||
"""Compare performance: empty skill vs full skill"""
|
||||
@@ -360,9 +363,9 @@ class TestAdaptorBenchmarks(unittest.TestCase):
|
||||
adaptor.format_skill_md(full_dir, metadata)
|
||||
full_time = time.perf_counter() - start
|
||||
|
||||
print(f"\nEmpty skill: {empty_time*1000:.2f}ms")
|
||||
print(f"Full skill (50 refs): {full_time*1000:.2f}ms")
|
||||
print(f"Ratio: {full_time/empty_time:.1f}x")
|
||||
print(f"\nEmpty skill: {empty_time * 1000:.2f}ms")
|
||||
print(f"Full skill (50 refs): {full_time * 1000:.2f}ms")
|
||||
print(f"Ratio: {full_time / empty_time:.1f}x")
|
||||
|
||||
# Empty should be very fast
|
||||
self.assertLess(empty_time, 0.01, "Empty skill processing too slow")
|
||||
|
||||
@@ -662,8 +662,13 @@ export default {
|
||||
def test_e2e_all_rag_adaptors_from_same_skill(self):
|
||||
"""Test all 7 RAG adaptors can package the same skill"""
|
||||
rag_platforms = [
|
||||
"langchain", "llama-index", "haystack",
|
||||
"weaviate", "chroma", "faiss", "qdrant"
|
||||
"langchain",
|
||||
"llama-index",
|
||||
"haystack",
|
||||
"weaviate",
|
||||
"chroma",
|
||||
"faiss",
|
||||
"qdrant",
|
||||
]
|
||||
packages = {}
|
||||
|
||||
@@ -674,15 +679,11 @@ export default {
|
||||
package_path = adaptor.package(self.skill_dir, self.output_dir)
|
||||
|
||||
# Verify package was created
|
||||
self.assertTrue(
|
||||
package_path.exists(),
|
||||
f"Package not created for {platform}"
|
||||
)
|
||||
self.assertTrue(package_path.exists(), f"Package not created for {platform}")
|
||||
|
||||
# Verify it's a JSON file
|
||||
self.assertTrue(
|
||||
str(package_path).endswith(".json"),
|
||||
f"{platform} should produce JSON file"
|
||||
str(package_path).endswith(".json"), f"{platform} should produce JSON file"
|
||||
)
|
||||
|
||||
# Store for later verification
|
||||
@@ -696,10 +697,7 @@ export default {
|
||||
with open(path) as f:
|
||||
data = json.load(f)
|
||||
# Should be valid JSON (dict or list)
|
||||
self.assertIsInstance(
|
||||
data, (dict, list),
|
||||
f"{platform} should produce valid JSON"
|
||||
)
|
||||
self.assertIsInstance(data, (dict, list), f"{platform} should produce valid JSON")
|
||||
|
||||
def test_e2e_rag_adaptors_preserve_metadata(self):
|
||||
"""Test that metadata is preserved across RAG adaptors"""
|
||||
@@ -708,7 +706,7 @@ export default {
|
||||
description="Vue.js framework skill",
|
||||
version="2.0.0",
|
||||
author="Test Author",
|
||||
tags=["vue", "javascript", "frontend"]
|
||||
tags=["vue", "javascript", "frontend"],
|
||||
)
|
||||
|
||||
# Test subset of platforms (representative sample)
|
||||
@@ -758,33 +756,30 @@ export default {
|
||||
# Define expected structure for each platform
|
||||
validations = {
|
||||
"langchain": lambda d: (
|
||||
isinstance(d, list) and
|
||||
all("page_content" in item and "metadata" in item for item in d)
|
||||
isinstance(d, list)
|
||||
and all("page_content" in item and "metadata" in item for item in d)
|
||||
),
|
||||
"llama-index": lambda d: (
|
||||
isinstance(d, list) and
|
||||
all("text" in item and "metadata" in item for item in d)
|
||||
isinstance(d, list) and all("text" in item and "metadata" in item for item in d)
|
||||
),
|
||||
"haystack": lambda d: (
|
||||
isinstance(d, list) and
|
||||
all("content" in item and "meta" in item for item in d)
|
||||
isinstance(d, list) and all("content" in item and "meta" in item for item in d)
|
||||
),
|
||||
"weaviate": lambda d: (
|
||||
isinstance(d, dict) and
|
||||
"schema" in d and "objects" in d and "class_name" in d
|
||||
isinstance(d, dict) and "schema" in d and "objects" in d and "class_name" in d
|
||||
),
|
||||
"chroma": lambda d: (
|
||||
isinstance(d, dict) and
|
||||
"documents" in d and "metadatas" in d and "ids" in d and
|
||||
"collection_name" in d
|
||||
isinstance(d, dict)
|
||||
and "documents" in d
|
||||
and "metadatas" in d
|
||||
and "ids" in d
|
||||
and "collection_name" in d
|
||||
),
|
||||
"faiss": lambda d: (
|
||||
isinstance(d, dict) and
|
||||
"documents" in d and "metadatas" in d and "ids" in d
|
||||
isinstance(d, dict) and "documents" in d and "metadatas" in d and "ids" in d
|
||||
),
|
||||
"qdrant": lambda d: (
|
||||
isinstance(d, dict) and
|
||||
"collection_name" in d and "points" in d and "config" in d
|
||||
isinstance(d, dict) and "collection_name" in d and "points" in d and "config" in d
|
||||
),
|
||||
}
|
||||
|
||||
@@ -795,8 +790,7 @@ export default {
|
||||
|
||||
# Validate structure
|
||||
self.assertTrue(
|
||||
validate_func(data),
|
||||
f"{platform} validation failed: incorrect JSON structure"
|
||||
validate_func(data), f"{platform} validation failed: incorrect JSON structure"
|
||||
)
|
||||
|
||||
def test_e2e_rag_empty_skill_handling(self):
|
||||
@@ -838,9 +832,7 @@ export default {
|
||||
if platform == "langchain":
|
||||
categories = {item["metadata"]["category"] for item in data}
|
||||
elif platform == "weaviate":
|
||||
categories = {
|
||||
obj["properties"]["category"] for obj in data["objects"]
|
||||
}
|
||||
categories = {obj["properties"]["category"] for obj in data["objects"]}
|
||||
elif platform == "chroma":
|
||||
categories = {meta["category"] for meta in data["metadatas"]}
|
||||
|
||||
@@ -854,8 +846,7 @@ export default {
|
||||
# Check that at least one reference category exists
|
||||
ref_categories = categories - {"overview"}
|
||||
self.assertGreater(
|
||||
len(ref_categories), 0,
|
||||
f"{platform}: Should have at least one reference category"
|
||||
len(ref_categories), 0, f"{platform}: Should have at least one reference category"
|
||||
)
|
||||
|
||||
def test_e2e_rag_integration_workflow_chromadb(self):
|
||||
@@ -878,17 +869,10 @@ export default {
|
||||
|
||||
# Create collection and add documents
|
||||
collection = client.create_collection(data["collection_name"])
|
||||
collection.add(
|
||||
documents=data["documents"],
|
||||
metadatas=data["metadatas"],
|
||||
ids=data["ids"]
|
||||
)
|
||||
collection.add(documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"])
|
||||
|
||||
# Query
|
||||
results = collection.query(
|
||||
query_texts=["reactivity"],
|
||||
n_results=2
|
||||
)
|
||||
results = collection.query(query_texts=["reactivity"], n_results=2)
|
||||
|
||||
# Verify results
|
||||
self.assertGreater(len(results["documents"][0]), 0, "Should return results")
|
||||
|
||||
@@ -28,9 +28,7 @@ class TestChromaAdaptor:
|
||||
|
||||
# Create SKILL.md
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
skill_md.write_text(
|
||||
"# Test Skill\n\nThis is a test skill for Chroma format."
|
||||
)
|
||||
skill_md.write_text("# Test Skill\n\nThis is a test skill for Chroma format.")
|
||||
|
||||
# Create references directory with files
|
||||
refs_dir = skill_dir / "references"
|
||||
@@ -40,9 +38,7 @@ class TestChromaAdaptor:
|
||||
|
||||
# Format as Chroma collection
|
||||
adaptor = get_adaptor("chroma")
|
||||
metadata = SkillMetadata(
|
||||
name="test_skill", description="Test skill", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
|
||||
|
||||
collection_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
|
||||
@@ -124,7 +120,10 @@ class TestChromaAdaptor:
|
||||
# Upload may fail if chromadb not installed (expected)
|
||||
assert "message" in result
|
||||
# Either chromadb not installed or connection error
|
||||
assert ("chromadb not installed" in result["message"] or "Failed to connect" in result["message"])
|
||||
assert (
|
||||
"chromadb not installed" in result["message"]
|
||||
or "Failed to connect" in result["message"]
|
||||
)
|
||||
|
||||
def test_validate_api_key_returns_false(self):
|
||||
"""Test that API key validation returns False (no API needed)."""
|
||||
@@ -157,9 +156,7 @@ class TestChromaAdaptor:
|
||||
skill_dir.mkdir()
|
||||
|
||||
adaptor = get_adaptor("chroma")
|
||||
metadata = SkillMetadata(
|
||||
name="empty_skill", description="Empty", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
|
||||
|
||||
collection_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
collection = json.loads(collection_json)
|
||||
@@ -179,9 +176,7 @@ class TestChromaAdaptor:
|
||||
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
|
||||
|
||||
adaptor = get_adaptor("chroma")
|
||||
metadata = SkillMetadata(
|
||||
name="refs_only", description="Refs only", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
|
||||
|
||||
collection_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
collection = json.loads(collection_json)
|
||||
|
||||
@@ -28,9 +28,7 @@ class TestFAISSAdaptor:
|
||||
|
||||
# Create SKILL.md
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
skill_md.write_text(
|
||||
"# Test Skill\n\nThis is a test skill for FAISS format."
|
||||
)
|
||||
skill_md.write_text("# Test Skill\n\nThis is a test skill for FAISS format.")
|
||||
|
||||
# Create references directory with files
|
||||
refs_dir = skill_dir / "references"
|
||||
@@ -40,9 +38,7 @@ class TestFAISSAdaptor:
|
||||
|
||||
# Format as FAISS index data
|
||||
adaptor = get_adaptor("faiss")
|
||||
metadata = SkillMetadata(
|
||||
name="test_skill", description="Test skill", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
|
||||
|
||||
index_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
|
||||
@@ -158,9 +154,7 @@ class TestFAISSAdaptor:
|
||||
skill_dir.mkdir()
|
||||
|
||||
adaptor = get_adaptor("faiss")
|
||||
metadata = SkillMetadata(
|
||||
name="empty_skill", description="Empty", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
|
||||
|
||||
index_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
index_data = json.loads(index_json)
|
||||
@@ -180,9 +174,7 @@ class TestFAISSAdaptor:
|
||||
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
|
||||
|
||||
adaptor = get_adaptor("faiss")
|
||||
metadata = SkillMetadata(
|
||||
name="refs_only", description="Refs only", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
|
||||
|
||||
index_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
index_data = json.loads(index_json)
|
||||
|
||||
@@ -28,9 +28,7 @@ class TestHaystackAdaptor:
|
||||
|
||||
# Create SKILL.md
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
skill_md.write_text(
|
||||
"# Test Skill\n\nThis is a test skill for Haystack format."
|
||||
)
|
||||
skill_md.write_text("# Test Skill\n\nThis is a test skill for Haystack format.")
|
||||
|
||||
# Create references directory with files
|
||||
refs_dir = skill_dir / "references"
|
||||
@@ -40,9 +38,7 @@ class TestHaystackAdaptor:
|
||||
|
||||
# Format as Haystack Documents
|
||||
adaptor = get_adaptor("haystack")
|
||||
metadata = SkillMetadata(
|
||||
name="test_skill", description="Test skill", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
|
||||
|
||||
documents_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
|
||||
@@ -112,7 +108,7 @@ class TestHaystackAdaptor:
|
||||
"""Test upload returns instructions (no actual upload)."""
|
||||
# Create test package
|
||||
package_path = tmp_path / "test-haystack.json"
|
||||
package_path.write_text('[]')
|
||||
package_path.write_text("[]")
|
||||
|
||||
adaptor = get_adaptor("haystack")
|
||||
result = adaptor.upload(package_path, "fake-key")
|
||||
@@ -154,9 +150,7 @@ class TestHaystackAdaptor:
|
||||
skill_dir.mkdir()
|
||||
|
||||
adaptor = get_adaptor("haystack")
|
||||
metadata = SkillMetadata(
|
||||
name="empty_skill", description="Empty", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
|
||||
|
||||
documents_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
documents = json.loads(documents_json)
|
||||
@@ -174,9 +168,7 @@ class TestHaystackAdaptor:
|
||||
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
|
||||
|
||||
adaptor = get_adaptor("haystack")
|
||||
metadata = SkillMetadata(
|
||||
name="refs_only", description="Refs only", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
|
||||
|
||||
documents_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
documents = json.loads(documents_json)
|
||||
|
||||
@@ -28,9 +28,7 @@ class TestLangChainAdaptor:
|
||||
|
||||
# Create SKILL.md
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
skill_md.write_text(
|
||||
"# Test Skill\n\nThis is a test skill for LangChain format."
|
||||
)
|
||||
skill_md.write_text("# Test Skill\n\nThis is a test skill for LangChain format.")
|
||||
|
||||
# Create references directory with files
|
||||
refs_dir = skill_dir / "references"
|
||||
@@ -40,9 +38,7 @@ class TestLangChainAdaptor:
|
||||
|
||||
# Format as LangChain Documents
|
||||
adaptor = get_adaptor("langchain")
|
||||
metadata = SkillMetadata(
|
||||
name="test_skill", description="Test skill", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
|
||||
|
||||
documents_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
|
||||
@@ -112,7 +108,7 @@ class TestLangChainAdaptor:
|
||||
"""Test upload returns instructions (no actual upload)."""
|
||||
# Create test package
|
||||
package_path = tmp_path / "test-langchain.json"
|
||||
package_path.write_text('[]')
|
||||
package_path.write_text("[]")
|
||||
|
||||
adaptor = get_adaptor("langchain")
|
||||
result = adaptor.upload(package_path, "fake-key")
|
||||
@@ -153,9 +149,7 @@ class TestLangChainAdaptor:
|
||||
skill_dir.mkdir()
|
||||
|
||||
adaptor = get_adaptor("langchain")
|
||||
metadata = SkillMetadata(
|
||||
name="empty_skill", description="Empty", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
|
||||
|
||||
documents_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
documents = json.loads(documents_json)
|
||||
@@ -173,9 +167,7 @@ class TestLangChainAdaptor:
|
||||
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
|
||||
|
||||
adaptor = get_adaptor("langchain")
|
||||
metadata = SkillMetadata(
|
||||
name="refs_only", description="Refs only", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
|
||||
|
||||
documents_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
documents = json.loads(documents_json)
|
||||
|
||||
@@ -28,9 +28,7 @@ class TestLlamaIndexAdaptor:
|
||||
|
||||
# Create SKILL.md
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
skill_md.write_text(
|
||||
"# Test Skill\n\nThis is a test skill for LlamaIndex format."
|
||||
)
|
||||
skill_md.write_text("# Test Skill\n\nThis is a test skill for LlamaIndex format.")
|
||||
|
||||
# Create references directory with files
|
||||
refs_dir = skill_dir / "references"
|
||||
@@ -40,9 +38,7 @@ class TestLlamaIndexAdaptor:
|
||||
|
||||
# Format as LlamaIndex Documents
|
||||
adaptor = get_adaptor("llama-index")
|
||||
metadata = SkillMetadata(
|
||||
name="test_skill", description="Test skill", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
|
||||
|
||||
documents_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
|
||||
@@ -112,7 +108,7 @@ class TestLlamaIndexAdaptor:
|
||||
"""Test upload returns instructions (no actual upload)."""
|
||||
# Create test package
|
||||
package_path = tmp_path / "test-llama-index.json"
|
||||
package_path.write_text('[]')
|
||||
package_path.write_text("[]")
|
||||
|
||||
adaptor = get_adaptor("llama-index")
|
||||
result = adaptor.upload(package_path, "fake-key")
|
||||
@@ -153,9 +149,7 @@ class TestLlamaIndexAdaptor:
|
||||
skill_dir.mkdir()
|
||||
|
||||
adaptor = get_adaptor("llama-index")
|
||||
metadata = SkillMetadata(
|
||||
name="empty_skill", description="Empty", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
|
||||
|
||||
documents_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
documents = json.loads(documents_json)
|
||||
@@ -173,9 +167,7 @@ class TestLlamaIndexAdaptor:
|
||||
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
|
||||
|
||||
adaptor = get_adaptor("llama-index")
|
||||
metadata = SkillMetadata(
|
||||
name="refs_only", description="Refs only", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
|
||||
|
||||
documents_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
documents = json.loads(documents_json)
|
||||
|
||||
@@ -28,9 +28,7 @@ class TestQdrantAdaptor:
|
||||
|
||||
# Create SKILL.md
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
skill_md.write_text(
|
||||
"# Test Skill\n\nThis is a test skill for Qdrant format."
|
||||
)
|
||||
skill_md.write_text("# Test Skill\n\nThis is a test skill for Qdrant format.")
|
||||
|
||||
# Create references directory with files
|
||||
refs_dir = skill_dir / "references"
|
||||
@@ -40,9 +38,7 @@ class TestQdrantAdaptor:
|
||||
|
||||
# Format as Qdrant points
|
||||
adaptor = get_adaptor("qdrant")
|
||||
metadata = SkillMetadata(
|
||||
name="test_skill", description="Test skill", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
|
||||
|
||||
points_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
|
||||
@@ -119,7 +115,7 @@ class TestQdrantAdaptor:
|
||||
"""Test upload returns instructions (no actual upload)."""
|
||||
# Create test package
|
||||
package_path = tmp_path / "test-qdrant.json"
|
||||
package_path.write_text('[]')
|
||||
package_path.write_text("[]")
|
||||
|
||||
adaptor = get_adaptor("qdrant")
|
||||
result = adaptor.upload(package_path, "fake-key")
|
||||
@@ -160,9 +156,7 @@ class TestQdrantAdaptor:
|
||||
skill_dir.mkdir()
|
||||
|
||||
adaptor = get_adaptor("qdrant")
|
||||
metadata = SkillMetadata(
|
||||
name="empty_skill", description="Empty", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
|
||||
|
||||
points_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
result = json.loads(points_json)
|
||||
@@ -181,9 +175,7 @@ class TestQdrantAdaptor:
|
||||
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
|
||||
|
||||
adaptor = get_adaptor("qdrant")
|
||||
metadata = SkillMetadata(
|
||||
name="refs_only", description="Refs only", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
|
||||
|
||||
points_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
result = json.loads(points_json)
|
||||
|
||||
@@ -28,9 +28,7 @@ class TestWeaviateAdaptor:
|
||||
|
||||
# Create SKILL.md
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
skill_md.write_text(
|
||||
"# Test Skill\n\nThis is a test skill for Weaviate format."
|
||||
)
|
||||
skill_md.write_text("# Test Skill\n\nThis is a test skill for Weaviate format.")
|
||||
|
||||
# Create references directory with files
|
||||
refs_dir = skill_dir / "references"
|
||||
@@ -40,9 +38,7 @@ class TestWeaviateAdaptor:
|
||||
|
||||
# Format as Weaviate objects
|
||||
adaptor = get_adaptor("weaviate")
|
||||
metadata = SkillMetadata(
|
||||
name="test_skill", description="Test skill", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
|
||||
|
||||
objects_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
|
||||
@@ -119,7 +115,7 @@ class TestWeaviateAdaptor:
|
||||
"""Test upload returns instructions (no actual upload)."""
|
||||
# Create test package
|
||||
package_path = tmp_path / "test-weaviate.json"
|
||||
package_path.write_text('[]')
|
||||
package_path.write_text("[]")
|
||||
|
||||
adaptor = get_adaptor("weaviate")
|
||||
result = adaptor.upload(package_path, "fake-key")
|
||||
@@ -127,7 +123,11 @@ class TestWeaviateAdaptor:
|
||||
# Upload may fail if weaviate not installed (expected)
|
||||
assert "message" in result
|
||||
# Either weaviate not installed, invalid JSON, or connection error
|
||||
assert ("import weaviate" in result["message"] or "Failed to connect" in result["message"] or result["success"] is False)
|
||||
assert (
|
||||
"import weaviate" in result["message"]
|
||||
or "Failed to connect" in result["message"]
|
||||
or result["success"] is False
|
||||
)
|
||||
|
||||
def test_validate_api_key_returns_false(self):
|
||||
"""Test that API key validation returns False (no API needed)."""
|
||||
@@ -160,9 +160,7 @@ class TestWeaviateAdaptor:
|
||||
skill_dir.mkdir()
|
||||
|
||||
adaptor = get_adaptor("weaviate")
|
||||
metadata = SkillMetadata(
|
||||
name="empty_skill", description="Empty", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
|
||||
|
||||
objects_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
result = json.loads(objects_json)
|
||||
@@ -181,9 +179,7 @@ class TestWeaviateAdaptor:
|
||||
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
|
||||
|
||||
adaptor = get_adaptor("weaviate")
|
||||
metadata = SkillMetadata(
|
||||
name="refs_only", description="Refs only", version="1.0.0"
|
||||
)
|
||||
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
|
||||
|
||||
objects_json = adaptor.format_skill_md(skill_dir, metadata)
|
||||
result = json.loads(objects_json)
|
||||
|
||||
@@ -12,7 +12,7 @@ from skill_seekers.benchmark import (
|
||||
BenchmarkResult,
|
||||
BenchmarkRunner,
|
||||
BenchmarkReport,
|
||||
Metric
|
||||
Metric,
|
||||
)
|
||||
from skill_seekers.benchmark.models import TimingResult, MemoryUsage
|
||||
|
||||
@@ -37,12 +37,7 @@ class TestBenchmarkResult:
|
||||
"""Test adding timing result."""
|
||||
result = BenchmarkResult("test")
|
||||
|
||||
timing = TimingResult(
|
||||
operation="test_op",
|
||||
duration=1.5,
|
||||
iterations=1,
|
||||
avg_duration=1.5
|
||||
)
|
||||
timing = TimingResult(operation="test_op", duration=1.5, iterations=1, avg_duration=1.5)
|
||||
|
||||
result.add_timing(timing)
|
||||
|
||||
@@ -55,11 +50,7 @@ class TestBenchmarkResult:
|
||||
result = BenchmarkResult("test")
|
||||
|
||||
usage = MemoryUsage(
|
||||
operation="test_op",
|
||||
before_mb=100.0,
|
||||
after_mb=150.0,
|
||||
peak_mb=160.0,
|
||||
allocated_mb=50.0
|
||||
operation="test_op", before_mb=100.0, after_mb=150.0, peak_mb=160.0, allocated_mb=50.0
|
||||
)
|
||||
|
||||
result.add_memory(usage)
|
||||
@@ -72,11 +63,7 @@ class TestBenchmarkResult:
|
||||
"""Test adding custom metric."""
|
||||
result = BenchmarkResult("test")
|
||||
|
||||
metric = Metric(
|
||||
name="pages_per_sec",
|
||||
value=12.5,
|
||||
unit="pages/sec"
|
||||
)
|
||||
metric = Metric(name="pages_per_sec", value=12.5, unit="pages/sec")
|
||||
|
||||
result.add_metric(metric)
|
||||
|
||||
@@ -107,12 +94,7 @@ class TestBenchmarkResult:
|
||||
"""Test report generation."""
|
||||
result = BenchmarkResult("test")
|
||||
|
||||
timing = TimingResult(
|
||||
operation="test_op",
|
||||
duration=1.0,
|
||||
iterations=1,
|
||||
avg_duration=1.0
|
||||
)
|
||||
timing = TimingResult(operation="test_op", duration=1.0, iterations=1, avg_duration=1.0)
|
||||
result.add_timing(timing)
|
||||
|
||||
report = result.to_report()
|
||||
@@ -303,7 +285,7 @@ class TestBenchmark:
|
||||
before_mb=100.0,
|
||||
after_mb=1200.0,
|
||||
peak_mb=1500.0,
|
||||
allocated_mb=1100.0
|
||||
allocated_mb=1100.0,
|
||||
)
|
||||
benchmark.result.add_memory(usage)
|
||||
|
||||
@@ -370,10 +352,7 @@ class TestBenchmarkRunner:
|
||||
with bench.timer("op2"):
|
||||
time.sleep(0.03)
|
||||
|
||||
reports = runner.run_suite({
|
||||
"test1": bench1,
|
||||
"test2": bench2
|
||||
})
|
||||
reports = runner.run_suite({"test1": bench1, "test2": bench2})
|
||||
|
||||
assert len(reports) == 2
|
||||
assert "test1" in reports
|
||||
@@ -405,6 +384,7 @@ class TestBenchmarkRunner:
|
||||
|
||||
# Compare
|
||||
from skill_seekers.benchmark.models import ComparisonReport
|
||||
|
||||
comparison = runner.compare(baseline_path, improved_path)
|
||||
|
||||
assert isinstance(comparison, ComparisonReport)
|
||||
@@ -458,6 +438,7 @@ class TestBenchmarkRunner:
|
||||
def test_cleanup_old(self, tmp_path):
|
||||
"""Test cleaning up old benchmarks."""
|
||||
import os
|
||||
|
||||
runner = BenchmarkRunner(output_dir=tmp_path)
|
||||
|
||||
# Create 10 benchmark files with different timestamps
|
||||
@@ -476,10 +457,10 @@ class TestBenchmarkRunner:
|
||||
"memory": [],
|
||||
"metrics": [],
|
||||
"system_info": {},
|
||||
"recommendations": []
|
||||
"recommendations": [],
|
||||
}
|
||||
|
||||
with open(file_path, 'w') as f:
|
||||
with open(file_path, "w") as f:
|
||||
json.dump(report_data, f)
|
||||
|
||||
# Set different modification times
|
||||
@@ -505,12 +486,7 @@ class TestBenchmarkModels:
|
||||
|
||||
def test_timing_result_model(self):
|
||||
"""Test TimingResult model."""
|
||||
timing = TimingResult(
|
||||
operation="test",
|
||||
duration=1.5,
|
||||
iterations=10,
|
||||
avg_duration=0.15
|
||||
)
|
||||
timing = TimingResult(operation="test", duration=1.5, iterations=10, avg_duration=0.15)
|
||||
|
||||
assert timing.operation == "test"
|
||||
assert timing.duration == 1.5
|
||||
@@ -520,11 +496,7 @@ class TestBenchmarkModels:
|
||||
def test_memory_usage_model(self):
|
||||
"""Test MemoryUsage model."""
|
||||
usage = MemoryUsage(
|
||||
operation="allocate",
|
||||
before_mb=100.0,
|
||||
after_mb=200.0,
|
||||
peak_mb=250.0,
|
||||
allocated_mb=100.0
|
||||
operation="allocate", before_mb=100.0, after_mb=200.0, peak_mb=250.0, allocated_mb=100.0
|
||||
)
|
||||
|
||||
assert usage.operation == "allocate"
|
||||
@@ -533,11 +505,7 @@ class TestBenchmarkModels:
|
||||
|
||||
def test_metric_model(self):
|
||||
"""Test Metric model."""
|
||||
metric = Metric(
|
||||
name="throughput",
|
||||
value=125.5,
|
||||
unit="ops/sec"
|
||||
)
|
||||
metric = Metric(name="throughput", value=125.5, unit="ops/sec")
|
||||
|
||||
assert metric.name == "throughput"
|
||||
assert metric.value == 125.5
|
||||
@@ -551,26 +519,19 @@ class TestBenchmarkModels:
|
||||
started_at=datetime.utcnow(),
|
||||
finished_at=datetime.utcnow(),
|
||||
total_duration=5.0,
|
||||
timings=[
|
||||
TimingResult(
|
||||
operation="op1",
|
||||
duration=2.0,
|
||||
iterations=1,
|
||||
avg_duration=2.0
|
||||
)
|
||||
],
|
||||
timings=[TimingResult(operation="op1", duration=2.0, iterations=1, avg_duration=2.0)],
|
||||
memory=[
|
||||
MemoryUsage(
|
||||
operation="op1",
|
||||
before_mb=100.0,
|
||||
after_mb=200.0,
|
||||
peak_mb=250.0,
|
||||
allocated_mb=100.0
|
||||
allocated_mb=100.0,
|
||||
)
|
||||
],
|
||||
metrics=[],
|
||||
system_info={},
|
||||
recommendations=[]
|
||||
recommendations=[],
|
||||
)
|
||||
|
||||
summary = report.summary
|
||||
@@ -592,7 +553,7 @@ class TestBenchmarkModels:
|
||||
memory=[],
|
||||
metrics=[],
|
||||
system_info={},
|
||||
recommendations=[]
|
||||
recommendations=[],
|
||||
)
|
||||
|
||||
current = BenchmarkReport(
|
||||
@@ -604,7 +565,7 @@ class TestBenchmarkModels:
|
||||
memory=[],
|
||||
metrics=[],
|
||||
system_info={},
|
||||
recommendations=[]
|
||||
recommendations=[],
|
||||
)
|
||||
|
||||
comparison = ComparisonReport(
|
||||
@@ -614,7 +575,7 @@ class TestBenchmarkModels:
|
||||
improvements=[],
|
||||
regressions=["Slower performance"],
|
||||
speedup_factor=0.5,
|
||||
memory_change_mb=0.0
|
||||
memory_change_mb=0.0,
|
||||
)
|
||||
|
||||
assert comparison.has_regressions is True
|
||||
@@ -632,7 +593,7 @@ class TestBenchmarkModels:
|
||||
memory=[],
|
||||
metrics=[],
|
||||
system_info={},
|
||||
recommendations=[]
|
||||
recommendations=[],
|
||||
)
|
||||
|
||||
current = BenchmarkReport(
|
||||
@@ -644,7 +605,7 @@ class TestBenchmarkModels:
|
||||
memory=[],
|
||||
metrics=[],
|
||||
system_info={},
|
||||
recommendations=[]
|
||||
recommendations=[],
|
||||
)
|
||||
|
||||
comparison = ComparisonReport(
|
||||
@@ -654,7 +615,7 @@ class TestBenchmarkModels:
|
||||
improvements=[],
|
||||
regressions=[],
|
||||
speedup_factor=2.0,
|
||||
memory_change_mb=0.0
|
||||
memory_change_mb=0.0,
|
||||
)
|
||||
|
||||
improvement = comparison.overall_improvement
|
||||
|
||||
@@ -60,7 +60,7 @@ class TestChunkingDisabledByDefault:
|
||||
"""Test that LangChain doesn't chunk by default."""
|
||||
skill_dir = create_test_skill(tmp_path, large_doc=True)
|
||||
|
||||
adaptor = get_adaptor('langchain')
|
||||
adaptor = get_adaptor("langchain")
|
||||
package_path = adaptor.package(skill_dir, tmp_path)
|
||||
|
||||
with open(package_path) as f:
|
||||
@@ -71,8 +71,8 @@ class TestChunkingDisabledByDefault:
|
||||
|
||||
# No chunking metadata
|
||||
for doc in data:
|
||||
assert 'is_chunked' not in doc['metadata']
|
||||
assert 'chunk_index' not in doc['metadata']
|
||||
assert "is_chunked" not in doc["metadata"]
|
||||
assert "chunk_index" not in doc["metadata"]
|
||||
|
||||
|
||||
class TestChunkingEnabled:
|
||||
@@ -82,12 +82,9 @@ class TestChunkingEnabled:
|
||||
"""Test that LangChain chunks large documents when enabled."""
|
||||
skill_dir = create_test_skill(tmp_path, large_doc=True)
|
||||
|
||||
adaptor = get_adaptor('langchain')
|
||||
adaptor = get_adaptor("langchain")
|
||||
package_path = adaptor.package(
|
||||
skill_dir,
|
||||
tmp_path,
|
||||
enable_chunking=True,
|
||||
chunk_max_tokens=512
|
||||
skill_dir, tmp_path, enable_chunking=True, chunk_max_tokens=512
|
||||
)
|
||||
|
||||
with open(package_path) as f:
|
||||
@@ -97,25 +94,22 @@ class TestChunkingEnabled:
|
||||
assert len(data) > 2, f"Large doc should be chunked, got {len(data)} docs"
|
||||
|
||||
# Check for chunking metadata
|
||||
chunked_docs = [doc for doc in data if doc['metadata'].get('is_chunked')]
|
||||
chunked_docs = [doc for doc in data if doc["metadata"].get("is_chunked")]
|
||||
assert len(chunked_docs) > 0, "Should have chunked documents"
|
||||
|
||||
# Verify chunk metadata structure
|
||||
for doc in chunked_docs:
|
||||
assert 'chunk_index' in doc['metadata']
|
||||
assert 'total_chunks' in doc['metadata']
|
||||
assert 'chunk_id' in doc['metadata']
|
||||
assert "chunk_index" in doc["metadata"]
|
||||
assert "total_chunks" in doc["metadata"]
|
||||
assert "chunk_id" in doc["metadata"]
|
||||
|
||||
def test_chunking_preserves_small_docs(self, tmp_path):
|
||||
"""Test that small documents are not chunked."""
|
||||
skill_dir = create_test_skill(tmp_path, large_doc=False)
|
||||
|
||||
adaptor = get_adaptor('langchain')
|
||||
adaptor = get_adaptor("langchain")
|
||||
package_path = adaptor.package(
|
||||
skill_dir,
|
||||
tmp_path,
|
||||
enable_chunking=True,
|
||||
chunk_max_tokens=512
|
||||
skill_dir, tmp_path, enable_chunking=True, chunk_max_tokens=512
|
||||
)
|
||||
|
||||
with open(package_path) as f:
|
||||
@@ -125,7 +119,7 @@ class TestChunkingEnabled:
|
||||
assert len(data) == 2, "Small docs should not be chunked"
|
||||
|
||||
for doc in data:
|
||||
assert 'is_chunked' not in doc['metadata']
|
||||
assert "is_chunked" not in doc["metadata"]
|
||||
|
||||
|
||||
class TestCodeBlockPreservation:
|
||||
@@ -158,43 +152,43 @@ More content after code block.
|
||||
# Create references dir (required)
|
||||
(skill_dir / "references").mkdir()
|
||||
|
||||
adaptor = get_adaptor('langchain')
|
||||
adaptor = get_adaptor("langchain")
|
||||
package_path = adaptor.package(
|
||||
skill_dir,
|
||||
tmp_path,
|
||||
enable_chunking=True,
|
||||
chunk_max_tokens=200, # Small chunks to force splitting
|
||||
preserve_code_blocks=True
|
||||
preserve_code_blocks=True,
|
||||
)
|
||||
|
||||
with open(package_path) as f:
|
||||
data = json.load(f)
|
||||
|
||||
# Find chunks with code block
|
||||
code_chunks = [
|
||||
doc for doc in data
|
||||
if '```python' in doc['page_content']
|
||||
]
|
||||
code_chunks = [doc for doc in data if "```python" in doc["page_content"]]
|
||||
|
||||
# Code block should be in at least one chunk
|
||||
assert len(code_chunks) >= 1, "Code block should be preserved"
|
||||
|
||||
# Code block should be complete (opening and closing backticks)
|
||||
for chunk in code_chunks:
|
||||
content = chunk['page_content']
|
||||
if '```python' in content:
|
||||
content = chunk["page_content"]
|
||||
if "```python" in content:
|
||||
# Should also have closing backticks
|
||||
assert content.count('```') >= 2, "Code block should be complete"
|
||||
assert content.count("```") >= 2, "Code block should be complete"
|
||||
|
||||
|
||||
class TestAutoChunkingForRAGPlatforms:
|
||||
"""Test that chunking is auto-enabled for RAG platforms."""
|
||||
|
||||
@pytest.mark.parametrize("platform", [
|
||||
'langchain',
|
||||
# Add others after they're updated:
|
||||
# 'llama-index', 'haystack', 'weaviate', 'chroma', 'faiss', 'qdrant'
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"platform",
|
||||
[
|
||||
"langchain",
|
||||
# Add others after they're updated:
|
||||
# 'llama-index', 'haystack', 'weaviate', 'chroma', 'faiss', 'qdrant'
|
||||
],
|
||||
)
|
||||
def test_rag_platforms_auto_chunk(self, platform, tmp_path):
|
||||
"""Test that RAG platforms auto-enable chunking."""
|
||||
skill_dir = create_test_skill(tmp_path, large_doc=True)
|
||||
@@ -208,7 +202,7 @@ class TestAutoChunkingForRAGPlatforms:
|
||||
open_folder_after=False,
|
||||
skip_quality_check=True,
|
||||
target=platform,
|
||||
enable_chunking=False # Explicitly disabled, but should be auto-enabled
|
||||
enable_chunking=False, # Explicitly disabled, but should be auto-enabled
|
||||
)
|
||||
|
||||
assert success, f"Packaging failed for {platform}"
|
||||
@@ -221,8 +215,8 @@ class TestAutoChunkingForRAGPlatforms:
|
||||
# Should have multiple documents/chunks
|
||||
if isinstance(data, list):
|
||||
assert len(data) > 2, f"{platform}: Should auto-chunk large docs"
|
||||
elif isinstance(data, dict) and 'documents' in data:
|
||||
assert len(data['documents']) > 2, f"{platform}: Should auto-chunk large docs"
|
||||
elif isinstance(data, dict) and "documents" in data:
|
||||
assert len(data["documents"]) > 2, f"{platform}: Should auto-chunk large docs"
|
||||
|
||||
|
||||
class TestBaseAdaptorChunkingHelper:
|
||||
@@ -237,11 +231,7 @@ class TestBaseAdaptorChunkingHelper:
|
||||
content = "Test content " * 1000 # Large content
|
||||
metadata = {"source": "test"}
|
||||
|
||||
chunks = adaptor._maybe_chunk_content(
|
||||
content,
|
||||
metadata,
|
||||
enable_chunking=False
|
||||
)
|
||||
chunks = adaptor._maybe_chunk_content(content, metadata, enable_chunking=False)
|
||||
|
||||
# Should return single chunk
|
||||
assert len(chunks) == 1
|
||||
@@ -258,10 +248,7 @@ class TestBaseAdaptorChunkingHelper:
|
||||
metadata = {"source": "test"}
|
||||
|
||||
chunks = adaptor._maybe_chunk_content(
|
||||
content,
|
||||
metadata,
|
||||
enable_chunking=True,
|
||||
chunk_max_tokens=512
|
||||
content, metadata, enable_chunking=True, chunk_max_tokens=512
|
||||
)
|
||||
|
||||
# Should return single chunk
|
||||
@@ -282,7 +269,7 @@ class TestBaseAdaptorChunkingHelper:
|
||||
enable_chunking=True,
|
||||
chunk_max_tokens=512,
|
||||
preserve_code_blocks=True,
|
||||
source_file="test.md"
|
||||
source_file="test.md",
|
||||
)
|
||||
|
||||
# Should return multiple chunks
|
||||
@@ -292,12 +279,12 @@ class TestBaseAdaptorChunkingHelper:
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
assert isinstance(chunk_text, str)
|
||||
assert isinstance(chunk_meta, dict)
|
||||
assert chunk_meta['is_chunked']
|
||||
assert 'chunk_index' in chunk_meta
|
||||
assert 'chunk_id' in chunk_meta
|
||||
assert chunk_meta["is_chunked"]
|
||||
assert "chunk_index" in chunk_meta
|
||||
assert "chunk_id" in chunk_meta
|
||||
# Original metadata preserved
|
||||
assert chunk_meta['source'] == 'test'
|
||||
assert chunk_meta['file'] == 'test.md'
|
||||
assert chunk_meta["source"] == "test"
|
||||
assert chunk_meta["file"] == "test.md"
|
||||
|
||||
|
||||
class TestChunkingCLIIntegration:
|
||||
@@ -313,10 +300,10 @@ class TestChunkingCLIIntegration:
|
||||
skill_dir=skill_dir,
|
||||
open_folder_after=False,
|
||||
skip_quality_check=True,
|
||||
target='langchain',
|
||||
target="langchain",
|
||||
enable_chunking=True, # --chunk flag
|
||||
chunk_max_tokens=512,
|
||||
preserve_code_blocks=True
|
||||
preserve_code_blocks=True,
|
||||
)
|
||||
|
||||
assert success
|
||||
@@ -339,10 +326,10 @@ class TestChunkingCLIIntegration:
|
||||
skill_dir=skill_dir,
|
||||
open_folder_after=False,
|
||||
skip_quality_check=True,
|
||||
target='langchain',
|
||||
target="langchain",
|
||||
enable_chunking=True,
|
||||
chunk_max_tokens=256, # Small chunks
|
||||
preserve_code_blocks=True
|
||||
preserve_code_blocks=True,
|
||||
)
|
||||
|
||||
assert success
|
||||
@@ -355,10 +342,10 @@ class TestChunkingCLIIntegration:
|
||||
skill_dir=skill_dir,
|
||||
open_folder_after=False,
|
||||
skip_quality_check=True,
|
||||
target='langchain',
|
||||
target="langchain",
|
||||
enable_chunking=True,
|
||||
chunk_max_tokens=1024, # Large chunks
|
||||
preserve_code_blocks=True
|
||||
preserve_code_blocks=True,
|
||||
)
|
||||
|
||||
assert success
|
||||
@@ -367,9 +354,10 @@ class TestChunkingCLIIntegration:
|
||||
data_large = json.load(f)
|
||||
|
||||
# Small chunk size should produce more chunks
|
||||
assert len(data_small) > len(data_large), \
|
||||
assert len(data_small) > len(data_large), (
|
||||
f"Small chunks ({len(data_small)}) should be more than large chunks ({len(data_large)})"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
|
||||
@@ -30,12 +30,12 @@ class TestParserRegistry:
|
||||
"""Test getting list of parser names."""
|
||||
names = get_parser_names()
|
||||
assert len(names) == 19
|
||||
assert 'scrape' in names
|
||||
assert 'github' in names
|
||||
assert 'package' in names
|
||||
assert 'upload' in names
|
||||
assert 'analyze' in names
|
||||
assert 'config' in names
|
||||
assert "scrape" in names
|
||||
assert "github" in names
|
||||
assert "package" in names
|
||||
assert "upload" in names
|
||||
assert "analyze" in names
|
||||
assert "config" in names
|
||||
|
||||
def test_all_parsers_are_subcommand_parsers(self):
|
||||
"""Test that all parsers inherit from SubcommandParser."""
|
||||
@@ -45,9 +45,9 @@ class TestParserRegistry:
|
||||
def test_all_parsers_have_required_properties(self):
|
||||
"""Test that all parsers have name, help, description."""
|
||||
for parser in PARSERS:
|
||||
assert hasattr(parser, 'name')
|
||||
assert hasattr(parser, 'help')
|
||||
assert hasattr(parser, 'description')
|
||||
assert hasattr(parser, "name")
|
||||
assert hasattr(parser, "help")
|
||||
assert hasattr(parser, "description")
|
||||
assert isinstance(parser.name, str)
|
||||
assert isinstance(parser.help, str)
|
||||
assert isinstance(parser.description, str)
|
||||
@@ -57,7 +57,7 @@ class TestParserRegistry:
|
||||
def test_all_parsers_have_add_arguments_method(self):
|
||||
"""Test that all parsers implement add_arguments."""
|
||||
for parser in PARSERS:
|
||||
assert hasattr(parser, 'add_arguments')
|
||||
assert hasattr(parser, "add_arguments")
|
||||
assert callable(parser.add_arguments)
|
||||
|
||||
def test_no_duplicate_parser_names(self):
|
||||
@@ -106,21 +106,21 @@ class TestParserCreation:
|
||||
def test_register_parsers_creates_all_subcommands(self):
|
||||
"""Test that register_parsers creates all 19 subcommands."""
|
||||
main_parser = argparse.ArgumentParser()
|
||||
subparsers = main_parser.add_subparsers(dest='command')
|
||||
subparsers = main_parser.add_subparsers(dest="command")
|
||||
|
||||
# Register all parsers
|
||||
register_parsers(subparsers)
|
||||
|
||||
# Test that all commands can be parsed
|
||||
test_commands = [
|
||||
'config --show',
|
||||
'scrape --config test.json',
|
||||
'github --repo owner/repo',
|
||||
'package output/test/',
|
||||
'upload test.zip',
|
||||
'analyze --directory .',
|
||||
'enhance output/test/',
|
||||
'estimate test.json',
|
||||
"config --show",
|
||||
"scrape --config test.json",
|
||||
"github --repo owner/repo",
|
||||
"package output/test/",
|
||||
"upload test.zip",
|
||||
"analyze --directory .",
|
||||
"enhance output/test/",
|
||||
"estimate test.json",
|
||||
]
|
||||
|
||||
for cmd in test_commands:
|
||||
@@ -134,75 +134,76 @@ class TestSpecificParsers:
|
||||
def test_scrape_parser_arguments(self):
|
||||
"""Test ScrapeParser has correct arguments."""
|
||||
main_parser = argparse.ArgumentParser()
|
||||
subparsers = main_parser.add_subparsers(dest='command')
|
||||
subparsers = main_parser.add_subparsers(dest="command")
|
||||
|
||||
scrape_parser = ScrapeParser()
|
||||
scrape_parser.create_parser(subparsers)
|
||||
|
||||
# Test various argument combinations
|
||||
args = main_parser.parse_args(['scrape', '--config', 'test.json'])
|
||||
assert args.command == 'scrape'
|
||||
assert args.config == 'test.json'
|
||||
args = main_parser.parse_args(["scrape", "--config", "test.json"])
|
||||
assert args.command == "scrape"
|
||||
assert args.config == "test.json"
|
||||
|
||||
args = main_parser.parse_args(['scrape', '--config', 'test.json', '--max-pages', '100'])
|
||||
args = main_parser.parse_args(["scrape", "--config", "test.json", "--max-pages", "100"])
|
||||
assert args.max_pages == 100
|
||||
|
||||
args = main_parser.parse_args(['scrape', '--enhance'])
|
||||
args = main_parser.parse_args(["scrape", "--enhance"])
|
||||
assert args.enhance is True
|
||||
|
||||
def test_github_parser_arguments(self):
|
||||
"""Test GitHubParser has correct arguments."""
|
||||
main_parser = argparse.ArgumentParser()
|
||||
subparsers = main_parser.add_subparsers(dest='command')
|
||||
subparsers = main_parser.add_subparsers(dest="command")
|
||||
|
||||
github_parser = GitHubParser()
|
||||
github_parser.create_parser(subparsers)
|
||||
|
||||
args = main_parser.parse_args(['github', '--repo', 'owner/repo'])
|
||||
assert args.command == 'github'
|
||||
assert args.repo == 'owner/repo'
|
||||
args = main_parser.parse_args(["github", "--repo", "owner/repo"])
|
||||
assert args.command == "github"
|
||||
assert args.repo == "owner/repo"
|
||||
|
||||
args = main_parser.parse_args(['github', '--repo', 'owner/repo', '--non-interactive'])
|
||||
args = main_parser.parse_args(["github", "--repo", "owner/repo", "--non-interactive"])
|
||||
assert args.non_interactive is True
|
||||
|
||||
def test_package_parser_arguments(self):
|
||||
"""Test PackageParser has correct arguments."""
|
||||
main_parser = argparse.ArgumentParser()
|
||||
subparsers = main_parser.add_subparsers(dest='command')
|
||||
subparsers = main_parser.add_subparsers(dest="command")
|
||||
|
||||
package_parser = PackageParser()
|
||||
package_parser.create_parser(subparsers)
|
||||
|
||||
args = main_parser.parse_args(['package', 'output/test/'])
|
||||
assert args.command == 'package'
|
||||
assert args.skill_directory == 'output/test/'
|
||||
args = main_parser.parse_args(["package", "output/test/"])
|
||||
assert args.command == "package"
|
||||
assert args.skill_directory == "output/test/"
|
||||
|
||||
args = main_parser.parse_args(['package', 'output/test/', '--target', 'gemini'])
|
||||
assert args.target == 'gemini'
|
||||
args = main_parser.parse_args(["package", "output/test/", "--target", "gemini"])
|
||||
assert args.target == "gemini"
|
||||
|
||||
args = main_parser.parse_args(['package', 'output/test/', '--no-open'])
|
||||
args = main_parser.parse_args(["package", "output/test/", "--no-open"])
|
||||
assert args.no_open is True
|
||||
|
||||
def test_analyze_parser_arguments(self):
|
||||
"""Test AnalyzeParser has correct arguments."""
|
||||
main_parser = argparse.ArgumentParser()
|
||||
subparsers = main_parser.add_subparsers(dest='command')
|
||||
subparsers = main_parser.add_subparsers(dest="command")
|
||||
|
||||
from skill_seekers.cli.parsers.analyze_parser import AnalyzeParser
|
||||
|
||||
analyze_parser = AnalyzeParser()
|
||||
analyze_parser.create_parser(subparsers)
|
||||
|
||||
args = main_parser.parse_args(['analyze', '--directory', '.'])
|
||||
assert args.command == 'analyze'
|
||||
assert args.directory == '.'
|
||||
args = main_parser.parse_args(["analyze", "--directory", "."])
|
||||
assert args.command == "analyze"
|
||||
assert args.directory == "."
|
||||
|
||||
args = main_parser.parse_args(['analyze', '--directory', '.', '--quick'])
|
||||
args = main_parser.parse_args(["analyze", "--directory", ".", "--quick"])
|
||||
assert args.quick is True
|
||||
|
||||
args = main_parser.parse_args(['analyze', '--directory', '.', '--comprehensive'])
|
||||
args = main_parser.parse_args(["analyze", "--directory", ".", "--comprehensive"])
|
||||
assert args.comprehensive is True
|
||||
|
||||
args = main_parser.parse_args(['analyze', '--directory', '.', '--skip-patterns'])
|
||||
args = main_parser.parse_args(["analyze", "--directory", ".", "--skip-patterns"])
|
||||
assert args.skip_patterns is True
|
||||
|
||||
|
||||
@@ -215,11 +216,25 @@ class TestBackwardCompatibility:
|
||||
|
||||
# Original commands from old main.py
|
||||
original_commands = [
|
||||
'config', 'scrape', 'github', 'pdf', 'unified',
|
||||
'enhance', 'enhance-status', 'package', 'upload',
|
||||
'estimate', 'extract-test-examples', 'install-agent',
|
||||
'analyze', 'install', 'resume', 'stream',
|
||||
'update', 'multilang', 'quality'
|
||||
"config",
|
||||
"scrape",
|
||||
"github",
|
||||
"pdf",
|
||||
"unified",
|
||||
"enhance",
|
||||
"enhance-status",
|
||||
"package",
|
||||
"upload",
|
||||
"estimate",
|
||||
"extract-test-examples",
|
||||
"install-agent",
|
||||
"analyze",
|
||||
"install",
|
||||
"resume",
|
||||
"stream",
|
||||
"update",
|
||||
"multilang",
|
||||
"quality",
|
||||
]
|
||||
|
||||
for cmd in original_commands:
|
||||
|
||||
@@ -20,18 +20,21 @@ from skill_seekers.cli.storage import (
|
||||
# Check if cloud storage dependencies are available
|
||||
try:
|
||||
import boto3 # noqa: F401
|
||||
|
||||
BOTO3_AVAILABLE = True
|
||||
except ImportError:
|
||||
BOTO3_AVAILABLE = False
|
||||
|
||||
try:
|
||||
from google.cloud import storage # noqa: F401
|
||||
|
||||
GCS_AVAILABLE = True
|
||||
except ImportError:
|
||||
GCS_AVAILABLE = False
|
||||
|
||||
try:
|
||||
from azure.storage.blob import BlobServiceClient # noqa: F401
|
||||
|
||||
AZURE_AVAILABLE = True
|
||||
except ImportError:
|
||||
AZURE_AVAILABLE = False
|
||||
@@ -41,12 +44,13 @@ except ImportError:
|
||||
# Factory Tests
|
||||
# ========================================
|
||||
|
||||
|
||||
def test_get_storage_adaptor_s3():
|
||||
"""Test S3 adaptor factory."""
|
||||
if not BOTO3_AVAILABLE:
|
||||
pytest.skip("boto3 not installed")
|
||||
with patch('skill_seekers.cli.storage.s3_storage.boto3'):
|
||||
adaptor = get_storage_adaptor('s3', bucket='test-bucket')
|
||||
with patch("skill_seekers.cli.storage.s3_storage.boto3"):
|
||||
adaptor = get_storage_adaptor("s3", bucket="test-bucket")
|
||||
assert isinstance(adaptor, S3StorageAdaptor)
|
||||
|
||||
|
||||
@@ -54,8 +58,8 @@ def test_get_storage_adaptor_gcs():
|
||||
"""Test GCS adaptor factory."""
|
||||
if not GCS_AVAILABLE:
|
||||
pytest.skip("google-cloud-storage not installed")
|
||||
with patch('skill_seekers.cli.storage.gcs_storage.storage'):
|
||||
adaptor = get_storage_adaptor('gcs', bucket='test-bucket')
|
||||
with patch("skill_seekers.cli.storage.gcs_storage.storage"):
|
||||
adaptor = get_storage_adaptor("gcs", bucket="test-bucket")
|
||||
assert isinstance(adaptor, GCSStorageAdaptor)
|
||||
|
||||
|
||||
@@ -63,11 +67,11 @@ def test_get_storage_adaptor_azure():
|
||||
"""Test Azure adaptor factory."""
|
||||
if not AZURE_AVAILABLE:
|
||||
pytest.skip("azure-storage-blob not installed")
|
||||
with patch('skill_seekers.cli.storage.azure_storage.BlobServiceClient'):
|
||||
with patch("skill_seekers.cli.storage.azure_storage.BlobServiceClient"):
|
||||
adaptor = get_storage_adaptor(
|
||||
'azure',
|
||||
container='test-container',
|
||||
connection_string='DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key'
|
||||
"azure",
|
||||
container="test-container",
|
||||
connection_string="DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key",
|
||||
)
|
||||
assert isinstance(adaptor, AzureStorageAdaptor)
|
||||
|
||||
@@ -75,36 +79,37 @@ def test_get_storage_adaptor_azure():
|
||||
def test_get_storage_adaptor_invalid_provider():
|
||||
"""Test invalid provider raises error."""
|
||||
with pytest.raises(ValueError, match="Unsupported storage provider"):
|
||||
get_storage_adaptor('invalid', bucket='test')
|
||||
get_storage_adaptor("invalid", bucket="test")
|
||||
|
||||
|
||||
# ========================================
|
||||
# S3 Storage Tests
|
||||
# ========================================
|
||||
|
||||
|
||||
def test_s3_upload_file():
|
||||
"""Test S3 file upload."""
|
||||
if not BOTO3_AVAILABLE:
|
||||
pytest.skip("boto3 not installed")
|
||||
|
||||
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
|
||||
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
|
||||
# Setup mocks
|
||||
mock_client = Mock()
|
||||
mock_boto3.client.return_value = mock_client
|
||||
mock_boto3.resource.return_value = Mock()
|
||||
|
||||
adaptor = S3StorageAdaptor(bucket='test-bucket')
|
||||
adaptor = S3StorageAdaptor(bucket="test-bucket")
|
||||
|
||||
# Create temporary file
|
||||
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
|
||||
tmp_file.write(b'test content')
|
||||
tmp_file.write(b"test content")
|
||||
tmp_path = tmp_file.name
|
||||
|
||||
try:
|
||||
# Test upload
|
||||
result = adaptor.upload_file(tmp_path, 'test.txt')
|
||||
result = adaptor.upload_file(tmp_path, "test.txt")
|
||||
|
||||
assert result == 's3://test-bucket/test.txt'
|
||||
assert result == "s3://test-bucket/test.txt"
|
||||
mock_client.upload_file.assert_called_once()
|
||||
finally:
|
||||
Path(tmp_path).unlink()
|
||||
@@ -115,23 +120,21 @@ def test_s3_download_file():
|
||||
if not BOTO3_AVAILABLE:
|
||||
pytest.skip("boto3 not installed")
|
||||
|
||||
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
|
||||
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
|
||||
# Setup mocks
|
||||
mock_client = Mock()
|
||||
mock_boto3.client.return_value = mock_client
|
||||
mock_boto3.resource.return_value = Mock()
|
||||
|
||||
adaptor = S3StorageAdaptor(bucket='test-bucket')
|
||||
adaptor = S3StorageAdaptor(bucket="test-bucket")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
local_path = os.path.join(tmp_dir, 'downloaded.txt')
|
||||
local_path = os.path.join(tmp_dir, "downloaded.txt")
|
||||
|
||||
# Test download
|
||||
adaptor.download_file('test.txt', local_path)
|
||||
adaptor.download_file("test.txt", local_path)
|
||||
|
||||
mock_client.download_file.assert_called_once_with(
|
||||
'test-bucket', 'test.txt', local_path
|
||||
)
|
||||
mock_client.download_file.assert_called_once_with("test-bucket", "test.txt", local_path)
|
||||
|
||||
|
||||
def test_s3_list_files():
|
||||
@@ -139,18 +142,18 @@ def test_s3_list_files():
|
||||
if not BOTO3_AVAILABLE:
|
||||
pytest.skip("boto3 not installed")
|
||||
|
||||
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
|
||||
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
|
||||
# Setup mocks
|
||||
mock_client = Mock()
|
||||
mock_paginator = Mock()
|
||||
mock_page_iterator = [
|
||||
{
|
||||
'Contents': [
|
||||
"Contents": [
|
||||
{
|
||||
'Key': 'file1.txt',
|
||||
'Size': 100,
|
||||
'LastModified': Mock(isoformat=lambda: '2024-01-01T00:00:00'),
|
||||
'ETag': '"abc123"'
|
||||
"Key": "file1.txt",
|
||||
"Size": 100,
|
||||
"LastModified": Mock(isoformat=lambda: "2024-01-01T00:00:00"),
|
||||
"ETag": '"abc123"',
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -161,15 +164,15 @@ def test_s3_list_files():
|
||||
mock_boto3.client.return_value = mock_client
|
||||
mock_boto3.resource.return_value = Mock()
|
||||
|
||||
adaptor = S3StorageAdaptor(bucket='test-bucket')
|
||||
adaptor = S3StorageAdaptor(bucket="test-bucket")
|
||||
|
||||
# Test list
|
||||
files = adaptor.list_files('prefix/')
|
||||
files = adaptor.list_files("prefix/")
|
||||
|
||||
assert len(files) == 1
|
||||
assert files[0].key == 'file1.txt'
|
||||
assert files[0].key == "file1.txt"
|
||||
assert files[0].size == 100
|
||||
assert files[0].etag == 'abc123'
|
||||
assert files[0].etag == "abc123"
|
||||
|
||||
|
||||
def test_s3_file_exists():
|
||||
@@ -177,17 +180,17 @@ def test_s3_file_exists():
|
||||
if not BOTO3_AVAILABLE:
|
||||
pytest.skip("boto3 not installed")
|
||||
|
||||
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
|
||||
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
|
||||
# Setup mocks
|
||||
mock_client = Mock()
|
||||
mock_client.head_object.return_value = {}
|
||||
mock_boto3.client.return_value = mock_client
|
||||
mock_boto3.resource.return_value = Mock()
|
||||
|
||||
adaptor = S3StorageAdaptor(bucket='test-bucket')
|
||||
adaptor = S3StorageAdaptor(bucket="test-bucket")
|
||||
|
||||
# Test exists
|
||||
assert adaptor.file_exists('test.txt') is True
|
||||
assert adaptor.file_exists("test.txt") is True
|
||||
|
||||
|
||||
def test_s3_get_file_url():
|
||||
@@ -195,19 +198,19 @@ def test_s3_get_file_url():
|
||||
if not BOTO3_AVAILABLE:
|
||||
pytest.skip("boto3 not installed")
|
||||
|
||||
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
|
||||
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
|
||||
# Setup mocks
|
||||
mock_client = Mock()
|
||||
mock_client.generate_presigned_url.return_value = 'https://s3.amazonaws.com/signed-url'
|
||||
mock_client.generate_presigned_url.return_value = "https://s3.amazonaws.com/signed-url"
|
||||
mock_boto3.client.return_value = mock_client
|
||||
mock_boto3.resource.return_value = Mock()
|
||||
|
||||
adaptor = S3StorageAdaptor(bucket='test-bucket')
|
||||
adaptor = S3StorageAdaptor(bucket="test-bucket")
|
||||
|
||||
# Test URL generation
|
||||
url = adaptor.get_file_url('test.txt', expires_in=7200)
|
||||
url = adaptor.get_file_url("test.txt", expires_in=7200)
|
||||
|
||||
assert url == 'https://s3.amazonaws.com/signed-url'
|
||||
assert url == "https://s3.amazonaws.com/signed-url"
|
||||
mock_client.generate_presigned_url.assert_called_once()
|
||||
|
||||
|
||||
@@ -215,12 +218,13 @@ def test_s3_get_file_url():
|
||||
# GCS Storage Tests
|
||||
# ========================================
|
||||
|
||||
|
||||
def test_gcs_upload_file():
|
||||
"""Test GCS file upload."""
|
||||
if not GCS_AVAILABLE:
|
||||
pytest.skip("google-cloud-storage not installed")
|
||||
|
||||
with patch('skill_seekers.cli.storage.gcs_storage.storage') as mock_storage:
|
||||
with patch("skill_seekers.cli.storage.gcs_storage.storage") as mock_storage:
|
||||
# Setup mocks
|
||||
mock_client = Mock()
|
||||
mock_bucket = Mock()
|
||||
@@ -230,18 +234,18 @@ def test_gcs_upload_file():
|
||||
mock_bucket.blob.return_value = mock_blob
|
||||
mock_storage.Client.return_value = mock_client
|
||||
|
||||
adaptor = GCSStorageAdaptor(bucket='test-bucket')
|
||||
adaptor = GCSStorageAdaptor(bucket="test-bucket")
|
||||
|
||||
# Create temporary file
|
||||
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
|
||||
tmp_file.write(b'test content')
|
||||
tmp_file.write(b"test content")
|
||||
tmp_path = tmp_file.name
|
||||
|
||||
try:
|
||||
# Test upload
|
||||
result = adaptor.upload_file(tmp_path, 'test.txt')
|
||||
result = adaptor.upload_file(tmp_path, "test.txt")
|
||||
|
||||
assert result == 'gs://test-bucket/test.txt'
|
||||
assert result == "gs://test-bucket/test.txt"
|
||||
mock_blob.upload_from_filename.assert_called_once()
|
||||
finally:
|
||||
Path(tmp_path).unlink()
|
||||
@@ -252,7 +256,7 @@ def test_gcs_download_file():
|
||||
if not GCS_AVAILABLE:
|
||||
pytest.skip("google-cloud-storage not installed")
|
||||
|
||||
with patch('skill_seekers.cli.storage.gcs_storage.storage') as mock_storage:
|
||||
with patch("skill_seekers.cli.storage.gcs_storage.storage") as mock_storage:
|
||||
# Setup mocks
|
||||
mock_client = Mock()
|
||||
mock_bucket = Mock()
|
||||
@@ -262,13 +266,13 @@ def test_gcs_download_file():
|
||||
mock_bucket.blob.return_value = mock_blob
|
||||
mock_storage.Client.return_value = mock_client
|
||||
|
||||
adaptor = GCSStorageAdaptor(bucket='test-bucket')
|
||||
adaptor = GCSStorageAdaptor(bucket="test-bucket")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
local_path = os.path.join(tmp_dir, 'downloaded.txt')
|
||||
local_path = os.path.join(tmp_dir, "downloaded.txt")
|
||||
|
||||
# Test download
|
||||
adaptor.download_file('test.txt', local_path)
|
||||
adaptor.download_file("test.txt", local_path)
|
||||
|
||||
mock_blob.download_to_filename.assert_called_once()
|
||||
|
||||
@@ -278,27 +282,27 @@ def test_gcs_list_files():
|
||||
if not GCS_AVAILABLE:
|
||||
pytest.skip("google-cloud-storage not installed")
|
||||
|
||||
with patch('skill_seekers.cli.storage.gcs_storage.storage') as mock_storage:
|
||||
with patch("skill_seekers.cli.storage.gcs_storage.storage") as mock_storage:
|
||||
# Setup mocks
|
||||
mock_client = Mock()
|
||||
mock_blob = Mock()
|
||||
mock_blob.name = 'file1.txt'
|
||||
mock_blob.name = "file1.txt"
|
||||
mock_blob.size = 100
|
||||
mock_blob.updated = Mock(isoformat=lambda: '2024-01-01T00:00:00')
|
||||
mock_blob.etag = 'abc123'
|
||||
mock_blob.updated = Mock(isoformat=lambda: "2024-01-01T00:00:00")
|
||||
mock_blob.etag = "abc123"
|
||||
mock_blob.metadata = {}
|
||||
|
||||
mock_client.list_blobs.return_value = [mock_blob]
|
||||
mock_storage.Client.return_value = mock_client
|
||||
mock_client.bucket.return_value = Mock()
|
||||
|
||||
adaptor = GCSStorageAdaptor(bucket='test-bucket')
|
||||
adaptor = GCSStorageAdaptor(bucket="test-bucket")
|
||||
|
||||
# Test list
|
||||
files = adaptor.list_files('prefix/')
|
||||
files = adaptor.list_files("prefix/")
|
||||
|
||||
assert len(files) == 1
|
||||
assert files[0].key == 'file1.txt'
|
||||
assert files[0].key == "file1.txt"
|
||||
assert files[0].size == 100
|
||||
|
||||
|
||||
@@ -306,12 +310,13 @@ def test_gcs_list_files():
|
||||
# Azure Storage Tests
|
||||
# ========================================
|
||||
|
||||
|
||||
def test_azure_upload_file():
|
||||
"""Test Azure file upload."""
|
||||
if not AZURE_AVAILABLE:
|
||||
pytest.skip("azure-storage-blob not installed")
|
||||
|
||||
with patch('skill_seekers.cli.storage.azure_storage.BlobServiceClient') as mock_blob_service:
|
||||
with patch("skill_seekers.cli.storage.azure_storage.BlobServiceClient") as mock_blob_service:
|
||||
# Setup mocks
|
||||
mock_service_client = Mock()
|
||||
mock_container_client = Mock()
|
||||
@@ -321,19 +326,21 @@ def test_azure_upload_file():
|
||||
mock_container_client.get_blob_client.return_value = mock_blob_client
|
||||
mock_blob_service.from_connection_string.return_value = mock_service_client
|
||||
|
||||
connection_string = 'DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key'
|
||||
adaptor = AzureStorageAdaptor(container='test-container', connection_string=connection_string)
|
||||
connection_string = "DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key"
|
||||
adaptor = AzureStorageAdaptor(
|
||||
container="test-container", connection_string=connection_string
|
||||
)
|
||||
|
||||
# Create temporary file
|
||||
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
|
||||
tmp_file.write(b'test content')
|
||||
tmp_file.write(b"test content")
|
||||
tmp_path = tmp_file.name
|
||||
|
||||
try:
|
||||
# Test upload
|
||||
result = adaptor.upload_file(tmp_path, 'test.txt')
|
||||
result = adaptor.upload_file(tmp_path, "test.txt")
|
||||
|
||||
assert 'test.blob.core.windows.net' in result
|
||||
assert "test.blob.core.windows.net" in result
|
||||
mock_blob_client.upload_blob.assert_called_once()
|
||||
finally:
|
||||
Path(tmp_path).unlink()
|
||||
@@ -344,30 +351,32 @@ def test_azure_download_file():
|
||||
if not AZURE_AVAILABLE:
|
||||
pytest.skip("azure-storage-blob not installed")
|
||||
|
||||
with patch('skill_seekers.cli.storage.azure_storage.BlobServiceClient') as mock_blob_service:
|
||||
with patch("skill_seekers.cli.storage.azure_storage.BlobServiceClient") as mock_blob_service:
|
||||
# Setup mocks
|
||||
mock_service_client = Mock()
|
||||
mock_container_client = Mock()
|
||||
mock_blob_client = Mock()
|
||||
mock_download_stream = Mock()
|
||||
mock_download_stream.readall.return_value = b'test content'
|
||||
mock_download_stream.readall.return_value = b"test content"
|
||||
|
||||
mock_service_client.get_container_client.return_value = mock_container_client
|
||||
mock_container_client.get_blob_client.return_value = mock_blob_client
|
||||
mock_blob_client.download_blob.return_value = mock_download_stream
|
||||
mock_blob_service.from_connection_string.return_value = mock_service_client
|
||||
|
||||
connection_string = 'DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key'
|
||||
adaptor = AzureStorageAdaptor(container='test-container', connection_string=connection_string)
|
||||
connection_string = "DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key"
|
||||
adaptor = AzureStorageAdaptor(
|
||||
container="test-container", connection_string=connection_string
|
||||
)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
local_path = os.path.join(tmp_dir, 'downloaded.txt')
|
||||
local_path = os.path.join(tmp_dir, "downloaded.txt")
|
||||
|
||||
# Test download
|
||||
adaptor.download_file('test.txt', local_path)
|
||||
adaptor.download_file("test.txt", local_path)
|
||||
|
||||
assert Path(local_path).exists()
|
||||
assert Path(local_path).read_bytes() == b'test content'
|
||||
assert Path(local_path).read_bytes() == b"test content"
|
||||
|
||||
|
||||
def test_azure_list_files():
|
||||
@@ -375,29 +384,31 @@ def test_azure_list_files():
|
||||
if not AZURE_AVAILABLE:
|
||||
pytest.skip("azure-storage-blob not installed")
|
||||
|
||||
with patch('skill_seekers.cli.storage.azure_storage.BlobServiceClient') as mock_blob_service:
|
||||
with patch("skill_seekers.cli.storage.azure_storage.BlobServiceClient") as mock_blob_service:
|
||||
# Setup mocks
|
||||
mock_service_client = Mock()
|
||||
mock_container_client = Mock()
|
||||
mock_blob = Mock()
|
||||
mock_blob.name = 'file1.txt'
|
||||
mock_blob.name = "file1.txt"
|
||||
mock_blob.size = 100
|
||||
mock_blob.last_modified = Mock(isoformat=lambda: '2024-01-01T00:00:00')
|
||||
mock_blob.etag = 'abc123'
|
||||
mock_blob.last_modified = Mock(isoformat=lambda: "2024-01-01T00:00:00")
|
||||
mock_blob.etag = "abc123"
|
||||
mock_blob.metadata = {}
|
||||
|
||||
mock_container_client.list_blobs.return_value = [mock_blob]
|
||||
mock_service_client.get_container_client.return_value = mock_container_client
|
||||
mock_blob_service.from_connection_string.return_value = mock_service_client
|
||||
|
||||
connection_string = 'DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key'
|
||||
adaptor = AzureStorageAdaptor(container='test-container', connection_string=connection_string)
|
||||
connection_string = "DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key"
|
||||
adaptor = AzureStorageAdaptor(
|
||||
container="test-container", connection_string=connection_string
|
||||
)
|
||||
|
||||
# Test list
|
||||
files = adaptor.list_files('prefix/')
|
||||
files = adaptor.list_files("prefix/")
|
||||
|
||||
assert len(files) == 1
|
||||
assert files[0].key == 'file1.txt'
|
||||
assert files[0].key == "file1.txt"
|
||||
assert files[0].size == 100
|
||||
|
||||
|
||||
@@ -405,53 +416,55 @@ def test_azure_list_files():
|
||||
# Base Adaptor Tests
|
||||
# ========================================
|
||||
|
||||
|
||||
def test_storage_object():
|
||||
"""Test StorageObject dataclass."""
|
||||
obj = StorageObject(
|
||||
key='test.txt',
|
||||
key="test.txt",
|
||||
size=100,
|
||||
last_modified='2024-01-01T00:00:00',
|
||||
etag='abc123',
|
||||
metadata={'key': 'value'}
|
||||
last_modified="2024-01-01T00:00:00",
|
||||
etag="abc123",
|
||||
metadata={"key": "value"},
|
||||
)
|
||||
|
||||
assert obj.key == 'test.txt'
|
||||
assert obj.key == "test.txt"
|
||||
assert obj.size == 100
|
||||
assert obj.metadata == {'key': 'value'}
|
||||
assert obj.metadata == {"key": "value"}
|
||||
|
||||
|
||||
def test_base_adaptor_abstract():
|
||||
"""Test that BaseStorageAdaptor cannot be instantiated."""
|
||||
with pytest.raises(TypeError):
|
||||
BaseStorageAdaptor(bucket='test')
|
||||
BaseStorageAdaptor(bucket="test")
|
||||
|
||||
|
||||
# ========================================
|
||||
# Integration-style Tests
|
||||
# ========================================
|
||||
|
||||
|
||||
def test_upload_directory():
|
||||
"""Test directory upload."""
|
||||
if not BOTO3_AVAILABLE:
|
||||
pytest.skip("boto3 not installed")
|
||||
|
||||
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
|
||||
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
|
||||
# Setup mocks
|
||||
mock_client = Mock()
|
||||
mock_boto3.client.return_value = mock_client
|
||||
mock_boto3.resource.return_value = Mock()
|
||||
|
||||
adaptor = S3StorageAdaptor(bucket='test-bucket')
|
||||
adaptor = S3StorageAdaptor(bucket="test-bucket")
|
||||
|
||||
# Create temporary directory with files
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
(Path(tmp_dir) / 'file1.txt').write_text('content1')
|
||||
(Path(tmp_dir) / 'file2.txt').write_text('content2')
|
||||
(Path(tmp_dir) / 'subdir').mkdir()
|
||||
(Path(tmp_dir) / 'subdir' / 'file3.txt').write_text('content3')
|
||||
(Path(tmp_dir) / "file1.txt").write_text("content1")
|
||||
(Path(tmp_dir) / "file2.txt").write_text("content2")
|
||||
(Path(tmp_dir) / "subdir").mkdir()
|
||||
(Path(tmp_dir) / "subdir" / "file3.txt").write_text("content3")
|
||||
|
||||
# Test upload directory
|
||||
uploaded_files = adaptor.upload_directory(tmp_dir, 'skills/')
|
||||
uploaded_files = adaptor.upload_directory(tmp_dir, "skills/")
|
||||
|
||||
assert len(uploaded_files) == 3
|
||||
assert mock_client.upload_file.call_count == 3
|
||||
@@ -462,25 +475,25 @@ def test_download_directory():
|
||||
if not BOTO3_AVAILABLE:
|
||||
pytest.skip("boto3 not installed")
|
||||
|
||||
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
|
||||
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
|
||||
# Setup mocks
|
||||
mock_client = Mock()
|
||||
mock_paginator = Mock()
|
||||
mock_page_iterator = [
|
||||
{
|
||||
'Contents': [
|
||||
"Contents": [
|
||||
{
|
||||
'Key': 'skills/file1.txt',
|
||||
'Size': 100,
|
||||
'LastModified': Mock(isoformat=lambda: '2024-01-01T00:00:00'),
|
||||
'ETag': '"abc"'
|
||||
"Key": "skills/file1.txt",
|
||||
"Size": 100,
|
||||
"LastModified": Mock(isoformat=lambda: "2024-01-01T00:00:00"),
|
||||
"ETag": '"abc"',
|
||||
},
|
||||
{
|
||||
'Key': 'skills/file2.txt',
|
||||
'Size': 200,
|
||||
'LastModified': Mock(isoformat=lambda: '2024-01-01T00:00:00'),
|
||||
'ETag': '"def"'
|
||||
}
|
||||
"Key": "skills/file2.txt",
|
||||
"Size": 200,
|
||||
"LastModified": Mock(isoformat=lambda: "2024-01-01T00:00:00"),
|
||||
"ETag": '"def"',
|
||||
},
|
||||
]
|
||||
}
|
||||
]
|
||||
@@ -490,11 +503,11 @@ def test_download_directory():
|
||||
mock_boto3.client.return_value = mock_client
|
||||
mock_boto3.resource.return_value = Mock()
|
||||
|
||||
adaptor = S3StorageAdaptor(bucket='test-bucket')
|
||||
adaptor = S3StorageAdaptor(bucket="test-bucket")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
# Test download directory
|
||||
downloaded_files = adaptor.download_directory('skills/', tmp_dir)
|
||||
downloaded_files = adaptor.download_directory("skills/", tmp_dir)
|
||||
|
||||
assert len(downloaded_files) == 2
|
||||
assert mock_client.download_file.call_count == 2
|
||||
|
||||
@@ -23,6 +23,7 @@ from skill_seekers.embedding.cache import EmbeddingCache
|
||||
# Cache Tests
|
||||
# ========================================
|
||||
|
||||
|
||||
def test_cache_init():
|
||||
"""Test cache initialization."""
|
||||
cache = EmbeddingCache(":memory:")
|
||||
@@ -121,6 +122,7 @@ def test_cache_context_manager():
|
||||
# Generator Tests
|
||||
# ========================================
|
||||
|
||||
|
||||
def test_generator_init():
|
||||
"""Test generator initialization."""
|
||||
generator = EmbeddingGenerator()
|
||||
@@ -174,7 +176,7 @@ def test_generator_compute_hash():
|
||||
assert hash1 != hash4
|
||||
|
||||
|
||||
@patch('skill_seekers.embedding.generator.SENTENCE_TRANSFORMERS_AVAILABLE', False)
|
||||
@patch("skill_seekers.embedding.generator.SENTENCE_TRANSFORMERS_AVAILABLE", False)
|
||||
def test_generator_sentence_transformers_not_available():
|
||||
"""Test sentence-transformers not available."""
|
||||
generator = EmbeddingGenerator()
|
||||
@@ -183,7 +185,7 @@ def test_generator_sentence_transformers_not_available():
|
||||
generator.generate("test", model="all-MiniLM-L6-v2")
|
||||
|
||||
|
||||
@patch('skill_seekers.embedding.generator.OPENAI_AVAILABLE', False)
|
||||
@patch("skill_seekers.embedding.generator.OPENAI_AVAILABLE", False)
|
||||
def test_generator_openai_not_available():
|
||||
"""Test OpenAI not available."""
|
||||
generator = EmbeddingGenerator()
|
||||
@@ -192,7 +194,7 @@ def test_generator_openai_not_available():
|
||||
generator.generate("test", model="text-embedding-3-small")
|
||||
|
||||
|
||||
@patch('skill_seekers.embedding.generator.VOYAGE_AVAILABLE', False)
|
||||
@patch("skill_seekers.embedding.generator.VOYAGE_AVAILABLE", False)
|
||||
def test_generator_voyage_not_available():
|
||||
"""Test Voyage AI not available."""
|
||||
generator = EmbeddingGenerator()
|
||||
@@ -227,13 +229,10 @@ def test_generator_voyage_large_2_model_info():
|
||||
# Model Tests
|
||||
# ========================================
|
||||
|
||||
|
||||
def test_embedding_request():
|
||||
"""Test EmbeddingRequest model."""
|
||||
request = EmbeddingRequest(
|
||||
text="Hello world",
|
||||
model="text-embedding-3-small",
|
||||
normalize=True
|
||||
)
|
||||
request = EmbeddingRequest(text="Hello world", model="text-embedding-3-small", normalize=True)
|
||||
|
||||
assert request.text == "Hello world"
|
||||
assert request.model == "text-embedding-3-small"
|
||||
@@ -243,9 +242,7 @@ def test_embedding_request():
|
||||
def test_batch_embedding_request():
|
||||
"""Test BatchEmbeddingRequest model."""
|
||||
request = BatchEmbeddingRequest(
|
||||
texts=["text1", "text2", "text3"],
|
||||
model="text-embedding-3-small",
|
||||
batch_size=32
|
||||
texts=["text1", "text2", "text3"], model="text-embedding-3-small", batch_size=32
|
||||
)
|
||||
|
||||
assert len(request.texts) == 3
|
||||
@@ -255,10 +252,7 @@ def test_batch_embedding_request():
|
||||
def test_embedding_response():
|
||||
"""Test EmbeddingResponse model."""
|
||||
response = EmbeddingResponse(
|
||||
embedding=[0.1, 0.2, 0.3],
|
||||
model="test-model",
|
||||
dimensions=3,
|
||||
cached=False
|
||||
embedding=[0.1, 0.2, 0.3], model="test-model", dimensions=3, cached=False
|
||||
)
|
||||
|
||||
assert len(response.embedding) == 3
|
||||
@@ -273,7 +267,7 @@ def test_batch_embedding_response():
|
||||
model="test-model",
|
||||
dimensions=2,
|
||||
count=2,
|
||||
cached_count=1
|
||||
cached_count=1,
|
||||
)
|
||||
|
||||
assert len(response.embeddings) == 2
|
||||
@@ -288,7 +282,7 @@ def test_health_response():
|
||||
version="1.0.0",
|
||||
models=["model1", "model2"],
|
||||
cache_enabled=True,
|
||||
cache_size=100
|
||||
cache_size=100,
|
||||
)
|
||||
|
||||
assert response.status == "ok"
|
||||
@@ -303,7 +297,7 @@ def test_model_info():
|
||||
provider="openai",
|
||||
dimensions=1536,
|
||||
max_tokens=8191,
|
||||
cost_per_million=0.02
|
||||
cost_per_million=0.02,
|
||||
)
|
||||
|
||||
assert info.name == "test-model"
|
||||
@@ -315,6 +309,7 @@ def test_model_info():
|
||||
# Integration Tests
|
||||
# ========================================
|
||||
|
||||
|
||||
def test_cache_batch_operations():
|
||||
"""Test cache batch operations."""
|
||||
cache = EmbeddingCache(":memory:")
|
||||
|
||||
@@ -23,7 +23,7 @@ from skill_seekers.cli.embedding_pipeline import (
|
||||
EmbeddingPipeline,
|
||||
LocalEmbeddingProvider,
|
||||
EmbeddingCache,
|
||||
CostTracker
|
||||
CostTracker,
|
||||
)
|
||||
|
||||
|
||||
@@ -112,21 +112,16 @@ def test_cost_tracker():
|
||||
|
||||
stats = tracker.get_stats()
|
||||
|
||||
assert stats['total_requests'] == 2
|
||||
assert stats['total_tokens'] == 1500
|
||||
assert stats['cache_hits'] == 1
|
||||
assert stats['cache_misses'] == 1
|
||||
assert '50.0%' in stats['cache_rate']
|
||||
assert stats["total_requests"] == 2
|
||||
assert stats["total_tokens"] == 1500
|
||||
assert stats["cache_hits"] == 1
|
||||
assert stats["cache_misses"] == 1
|
||||
assert "50.0%" in stats["cache_rate"]
|
||||
|
||||
|
||||
def test_pipeline_initialization():
|
||||
"""Test pipeline initialization."""
|
||||
config = EmbeddingConfig(
|
||||
provider='local',
|
||||
model='test-model',
|
||||
dimension=128,
|
||||
batch_size=10
|
||||
)
|
||||
config = EmbeddingConfig(provider="local", model="test-model", dimension=128, batch_size=10)
|
||||
|
||||
pipeline = EmbeddingPipeline(config)
|
||||
|
||||
@@ -137,12 +132,7 @@ def test_pipeline_initialization():
|
||||
|
||||
def test_pipeline_generate_batch():
|
||||
"""Test batch embedding generation."""
|
||||
config = EmbeddingConfig(
|
||||
provider='local',
|
||||
model='test-model',
|
||||
dimension=64,
|
||||
batch_size=2
|
||||
)
|
||||
config = EmbeddingConfig(provider="local", model="test-model", dimension=64, batch_size=2)
|
||||
|
||||
pipeline = EmbeddingPipeline(config)
|
||||
|
||||
@@ -159,11 +149,11 @@ def test_pipeline_caching():
|
||||
"""Test pipeline uses caching."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
config = EmbeddingConfig(
|
||||
provider='local',
|
||||
model='test-model',
|
||||
provider="local",
|
||||
model="test-model",
|
||||
dimension=32,
|
||||
batch_size=10,
|
||||
cache_dir=Path(tmpdir)
|
||||
cache_dir=Path(tmpdir),
|
||||
)
|
||||
|
||||
pipeline = EmbeddingPipeline(config)
|
||||
@@ -184,10 +174,10 @@ def test_pipeline_caching():
|
||||
def test_pipeline_batch_processing():
|
||||
"""Test large batch is processed in chunks."""
|
||||
config = EmbeddingConfig(
|
||||
provider='local',
|
||||
model='test-model',
|
||||
provider="local",
|
||||
model="test-model",
|
||||
dimension=16,
|
||||
batch_size=3 # Small batch size
|
||||
batch_size=3, # Small batch size
|
||||
)
|
||||
|
||||
pipeline = EmbeddingPipeline(config)
|
||||
@@ -201,11 +191,7 @@ def test_pipeline_batch_processing():
|
||||
|
||||
def test_validate_dimensions_valid():
|
||||
"""Test dimension validation with valid embeddings."""
|
||||
config = EmbeddingConfig(
|
||||
provider='local',
|
||||
model='test-model',
|
||||
dimension=128
|
||||
)
|
||||
config = EmbeddingConfig(provider="local", model="test-model", dimension=128)
|
||||
|
||||
pipeline = EmbeddingPipeline(config)
|
||||
|
||||
@@ -217,11 +203,7 @@ def test_validate_dimensions_valid():
|
||||
|
||||
def test_validate_dimensions_invalid():
|
||||
"""Test dimension validation with invalid embeddings."""
|
||||
config = EmbeddingConfig(
|
||||
provider='local',
|
||||
model='test-model',
|
||||
dimension=128
|
||||
)
|
||||
config = EmbeddingConfig(provider="local", model="test-model", dimension=128)
|
||||
|
||||
pipeline = EmbeddingPipeline(config)
|
||||
|
||||
@@ -234,30 +216,22 @@ def test_validate_dimensions_invalid():
|
||||
|
||||
def test_embedding_result_metadata():
|
||||
"""Test embedding result includes metadata."""
|
||||
config = EmbeddingConfig(
|
||||
provider='local',
|
||||
model='test-model',
|
||||
dimension=256
|
||||
)
|
||||
config = EmbeddingConfig(provider="local", model="test-model", dimension=256)
|
||||
|
||||
pipeline = EmbeddingPipeline(config)
|
||||
|
||||
texts = ["test"]
|
||||
result = pipeline.generate_batch(texts, show_progress=False)
|
||||
|
||||
assert 'provider' in result.metadata
|
||||
assert 'model' in result.metadata
|
||||
assert 'dimension' in result.metadata
|
||||
assert result.metadata['dimension'] == 256
|
||||
assert "provider" in result.metadata
|
||||
assert "model" in result.metadata
|
||||
assert "dimension" in result.metadata
|
||||
assert result.metadata["dimension"] == 256
|
||||
|
||||
|
||||
def test_cost_stats():
|
||||
"""Test cost statistics tracking."""
|
||||
config = EmbeddingConfig(
|
||||
provider='local',
|
||||
model='test-model',
|
||||
dimension=64
|
||||
)
|
||||
config = EmbeddingConfig(provider="local", model="test-model", dimension=64)
|
||||
|
||||
pipeline = EmbeddingPipeline(config)
|
||||
|
||||
@@ -266,18 +240,14 @@ def test_cost_stats():
|
||||
|
||||
stats = pipeline.get_cost_stats()
|
||||
|
||||
assert 'total_requests' in stats
|
||||
assert 'cache_hits' in stats
|
||||
assert 'estimated_cost' in stats
|
||||
assert "total_requests" in stats
|
||||
assert "cache_hits" in stats
|
||||
assert "estimated_cost" in stats
|
||||
|
||||
|
||||
def test_empty_batch():
|
||||
"""Test handling empty batch."""
|
||||
config = EmbeddingConfig(
|
||||
provider='local',
|
||||
model='test-model',
|
||||
dimension=32
|
||||
)
|
||||
config = EmbeddingConfig(provider="local", model="test-model", dimension=32)
|
||||
|
||||
pipeline = EmbeddingPipeline(config)
|
||||
|
||||
@@ -289,11 +259,7 @@ def test_empty_batch():
|
||||
|
||||
def test_single_document():
|
||||
"""Test single document generation."""
|
||||
config = EmbeddingConfig(
|
||||
provider='local',
|
||||
model='test-model',
|
||||
dimension=128
|
||||
)
|
||||
config = EmbeddingConfig(provider="local", model="test-model", dimension=128)
|
||||
|
||||
pipeline = EmbeddingPipeline(config)
|
||||
|
||||
@@ -306,11 +272,7 @@ def test_single_document():
|
||||
def test_different_dimensions():
|
||||
"""Test different embedding dimensions."""
|
||||
for dim in [64, 128, 256, 512]:
|
||||
config = EmbeddingConfig(
|
||||
provider='local',
|
||||
model='test-model',
|
||||
dimension=dim
|
||||
)
|
||||
config = EmbeddingConfig(provider="local", model="test-model", dimension=dim)
|
||||
|
||||
pipeline = EmbeddingPipeline(config)
|
||||
result = pipeline.generate_batch(["test"], show_progress=False)
|
||||
|
||||
@@ -152,9 +152,7 @@ class TestMultiAgentSupport:
|
||||
|
||||
def test_rejects_missing_executable(self, tmp_path, monkeypatch):
|
||||
"""Test rejection when executable is not found on PATH."""
|
||||
monkeypatch.setattr(
|
||||
"skill_seekers.cli.enhance_skill_local.shutil.which", lambda _exe: None
|
||||
)
|
||||
monkeypatch.setattr("skill_seekers.cli.enhance_skill_local.shutil.which", lambda _exe: None)
|
||||
skill_dir = _make_skill_dir(tmp_path)
|
||||
|
||||
with pytest.raises(ValueError, match="not found in PATH"):
|
||||
|
||||
@@ -80,8 +80,9 @@ class TestFrameworkDetection(unittest.TestCase):
|
||||
arch_data = json.load(f)
|
||||
|
||||
self.assertIn("frameworks_detected", arch_data)
|
||||
self.assertIn("Flask", arch_data["frameworks_detected"],
|
||||
"Flask should be detected from imports")
|
||||
self.assertIn(
|
||||
"Flask", arch_data["frameworks_detected"], "Flask should be detected from imports"
|
||||
)
|
||||
|
||||
def test_files_with_imports_are_included(self):
|
||||
"""Test that files with only imports are included in analysis (Issue #239)."""
|
||||
@@ -119,24 +120,19 @@ class TestFrameworkDetection(unittest.TestCase):
|
||||
analysis_data = json.load(f)
|
||||
|
||||
# File should be included
|
||||
self.assertGreater(len(analysis_data["files"]), 0,
|
||||
"Files with imports should be included")
|
||||
self.assertGreater(len(analysis_data["files"]), 0, "Files with imports should be included")
|
||||
|
||||
# Find our import-only file
|
||||
import_file = next(
|
||||
(f for f in analysis_data["files"] if "imports_only.py" in f["file"]),
|
||||
None
|
||||
(f for f in analysis_data["files"] if "imports_only.py" in f["file"]), None
|
||||
)
|
||||
self.assertIsNotNone(import_file, "Import-only file should be in analysis")
|
||||
|
||||
# Verify imports were extracted
|
||||
self.assertIn("imports", import_file, "Imports should be extracted")
|
||||
self.assertGreater(len(import_file["imports"]), 0,
|
||||
"Should have captured imports")
|
||||
self.assertIn("django", import_file["imports"],
|
||||
"Django import should be captured")
|
||||
self.assertIn("flask", import_file["imports"],
|
||||
"Flask import should be captured")
|
||||
self.assertGreater(len(import_file["imports"]), 0, "Should have captured imports")
|
||||
self.assertIn("django", import_file["imports"], "Django import should be captured")
|
||||
self.assertIn("flask", import_file["imports"], "Flask import should be captured")
|
||||
|
||||
def test_no_false_positive_frameworks(self):
|
||||
"""Test that framework detection doesn't produce false positives (Issue #239)."""
|
||||
@@ -145,10 +141,7 @@ class TestFrameworkDetection(unittest.TestCase):
|
||||
app_dir.mkdir()
|
||||
|
||||
# File with no framework imports
|
||||
(app_dir / "utils.py").write_text(
|
||||
"def my_function():\n"
|
||||
" return 'hello'\n"
|
||||
)
|
||||
(app_dir / "utils.py").write_text("def my_function():\n return 'hello'\n")
|
||||
|
||||
# Run codebase analyzer
|
||||
from skill_seekers.cli.codebase_scraper import main as scraper_main
|
||||
@@ -180,12 +173,10 @@ class TestFrameworkDetection(unittest.TestCase):
|
||||
|
||||
frameworks = arch_data.get("frameworks_detected", [])
|
||||
# Should not detect Flask just from "app" directory name
|
||||
self.assertNotIn("Flask", frameworks,
|
||||
"Should not detect Flask without imports")
|
||||
self.assertNotIn("Flask", frameworks, "Should not detect Flask without imports")
|
||||
# Should not detect other frameworks with "app" in markers
|
||||
for fw in ["ASP.NET", "Rails", "Laravel"]:
|
||||
self.assertNotIn(fw, frameworks,
|
||||
f"Should not detect {fw} without real evidence")
|
||||
self.assertNotIn(fw, frameworks, f"Should not detect {fw} without real evidence")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -20,9 +20,7 @@ import time
|
||||
# Add src to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from skill_seekers.cli.incremental_updater import (
|
||||
IncrementalUpdater
|
||||
)
|
||||
from skill_seekers.cli.incremental_updater import IncrementalUpdater
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -281,15 +279,15 @@ def test_apply_update_package(temp_skill_dir):
|
||||
"timestamp": "2026-02-05T12:00:00",
|
||||
"skill_name": "test_skill",
|
||||
"change_summary": {"modified": 1},
|
||||
"total_changes": 1
|
||||
"total_changes": 1,
|
||||
},
|
||||
"changes": {
|
||||
"SKILL.md": {
|
||||
"action": "modify",
|
||||
"version": 2,
|
||||
"content": "# Updated Content\n\nApplied from package"
|
||||
"content": "# Updated Content\n\nApplied from package",
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
package_path.write_text(json.dumps(update_data))
|
||||
@@ -298,7 +296,9 @@ def test_apply_update_package(temp_skill_dir):
|
||||
success = updater.apply_update_package(package_path)
|
||||
|
||||
assert success
|
||||
assert (temp_skill_dir / "SKILL.md").read_text() == "# Updated Content\n\nApplied from package"
|
||||
assert (
|
||||
temp_skill_dir / "SKILL.md"
|
||||
).read_text() == "# Updated Content\n\nApplied from package"
|
||||
|
||||
|
||||
def test_content_hash_consistency(temp_skill_dir):
|
||||
|
||||
@@ -92,7 +92,11 @@ class TestConfigLoading(unittest.TestCase):
|
||||
{
|
||||
"type": "documentation",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
|
||||
"selectors": {
|
||||
"main_content": "article",
|
||||
"title": "h1",
|
||||
"code_blocks": "pre code",
|
||||
},
|
||||
"rate_limit": 0.5,
|
||||
"max_pages": 100,
|
||||
}
|
||||
|
||||
@@ -113,6 +113,7 @@ def check_service_available(url: str, timeout: int = 5) -> bool:
|
||||
"""Check if a service is available."""
|
||||
try:
|
||||
import requests
|
||||
|
||||
response = requests.get(url, timeout=timeout)
|
||||
return response.status_code == 200
|
||||
except Exception:
|
||||
@@ -133,7 +134,9 @@ class TestWeaviateIntegration:
|
||||
|
||||
# Check if Weaviate is running
|
||||
if not check_service_available("http://localhost:8080/v1/.well-known/ready"):
|
||||
pytest.skip("Weaviate not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)")
|
||||
pytest.skip(
|
||||
"Weaviate not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)"
|
||||
)
|
||||
|
||||
# Connect to Weaviate
|
||||
try:
|
||||
@@ -144,10 +147,7 @@ class TestWeaviateIntegration:
|
||||
|
||||
# Package skill
|
||||
adaptor = get_adaptor("weaviate")
|
||||
SkillMetadata(
|
||||
name="integration_test",
|
||||
description="Integration test skill for Weaviate"
|
||||
)
|
||||
SkillMetadata(name="integration_test", description="Integration test skill for Weaviate")
|
||||
package_path = adaptor.package(sample_skill_dir, tmp_path)
|
||||
|
||||
assert package_path.exists(), "Package not created"
|
||||
@@ -173,19 +173,16 @@ class TestWeaviateIntegration:
|
||||
with client.batch as batch:
|
||||
for obj in data["objects"]:
|
||||
batch.add_data_object(
|
||||
data_object=obj["properties"],
|
||||
class_name=class_name,
|
||||
uuid=obj["id"]
|
||||
data_object=obj["properties"], class_name=class_name, uuid=obj["id"]
|
||||
)
|
||||
|
||||
# Wait for indexing
|
||||
time.sleep(1)
|
||||
|
||||
# Query - Get all objects
|
||||
result = client.query.get(
|
||||
class_name,
|
||||
["content", "source", "category"]
|
||||
).with_limit(10).do()
|
||||
result = (
|
||||
client.query.get(class_name, ["content", "source", "category"]).with_limit(10).do()
|
||||
)
|
||||
|
||||
# Verify results
|
||||
assert "data" in result, "Query returned no data"
|
||||
@@ -203,8 +200,9 @@ class TestWeaviateIntegration:
|
||||
|
||||
# Verify content
|
||||
contents = [obj["content"] for obj in objects]
|
||||
assert any("vector" in content.lower() for content in contents), \
|
||||
assert any("vector" in content.lower() for content in contents), (
|
||||
"Expected content not found"
|
||||
)
|
||||
|
||||
finally:
|
||||
# Cleanup - Delete collection
|
||||
@@ -234,7 +232,7 @@ class TestWeaviateIntegration:
|
||||
description="Test metadata preservation",
|
||||
version="2.0.0",
|
||||
author="Integration Test Suite",
|
||||
tags=["test", "integration", "weaviate"]
|
||||
tags=["test", "integration", "weaviate"],
|
||||
)
|
||||
package_path = adaptor.package(sample_skill_dir, tmp_path)
|
||||
|
||||
@@ -249,18 +247,17 @@ class TestWeaviateIntegration:
|
||||
with client.batch as batch:
|
||||
for obj in data["objects"]:
|
||||
batch.add_data_object(
|
||||
data_object=obj["properties"],
|
||||
class_name=class_name,
|
||||
uuid=obj["id"]
|
||||
data_object=obj["properties"], class_name=class_name, uuid=obj["id"]
|
||||
)
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
# Query and verify metadata
|
||||
result = client.query.get(
|
||||
class_name,
|
||||
["source", "version", "author", "tags"]
|
||||
).with_limit(1).do()
|
||||
result = (
|
||||
client.query.get(class_name, ["source", "version", "author", "tags"])
|
||||
.with_limit(1)
|
||||
.do()
|
||||
)
|
||||
|
||||
obj = result["data"]["Get"][class_name][0]
|
||||
assert obj["source"] == "metadata_test", "Source not preserved"
|
||||
@@ -287,7 +284,9 @@ class TestChromaIntegration:
|
||||
|
||||
# Check if Chroma is running
|
||||
if not check_service_available("http://localhost:8000/api/v1/heartbeat"):
|
||||
pytest.skip("ChromaDB not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)")
|
||||
pytest.skip(
|
||||
"ChromaDB not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)"
|
||||
)
|
||||
|
||||
# Connect to ChromaDB
|
||||
try:
|
||||
@@ -299,8 +298,7 @@ class TestChromaIntegration:
|
||||
# Package skill
|
||||
adaptor = get_adaptor("chroma")
|
||||
SkillMetadata(
|
||||
name="chroma_integration_test",
|
||||
description="Integration test skill for ChromaDB"
|
||||
name="chroma_integration_test", description="Integration test skill for ChromaDB"
|
||||
)
|
||||
package_path = adaptor.package(sample_skill_dir, tmp_path)
|
||||
|
||||
@@ -326,9 +324,7 @@ class TestChromaIntegration:
|
||||
|
||||
# Add documents
|
||||
collection.add(
|
||||
documents=data["documents"],
|
||||
metadatas=data["metadatas"],
|
||||
ids=data["ids"]
|
||||
documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"]
|
||||
)
|
||||
|
||||
# Wait for indexing
|
||||
@@ -340,8 +336,7 @@ class TestChromaIntegration:
|
||||
# Verify results
|
||||
assert "documents" in results, "Query returned no documents"
|
||||
assert len(results["documents"]) > 0, "No documents returned"
|
||||
assert len(results["documents"]) == len(data["documents"]), \
|
||||
"Document count mismatch"
|
||||
assert len(results["documents"]) == len(data["documents"]), "Document count mismatch"
|
||||
|
||||
# Verify metadata
|
||||
assert "metadatas" in results, "Query returned no metadatas"
|
||||
@@ -350,8 +345,9 @@ class TestChromaIntegration:
|
||||
assert "category" in first_metadata, "Missing category in metadata"
|
||||
|
||||
# Verify content
|
||||
assert any("vector" in doc.lower() for doc in results["documents"]), \
|
||||
assert any("vector" in doc.lower() for doc in results["documents"]), (
|
||||
"Expected content not found"
|
||||
)
|
||||
|
||||
finally:
|
||||
# Cleanup - Delete collection
|
||||
@@ -377,8 +373,7 @@ class TestChromaIntegration:
|
||||
# Package and upload
|
||||
adaptor = get_adaptor("chroma")
|
||||
metadata = SkillMetadata(
|
||||
name="chroma_filter_test",
|
||||
description="Test filtering capabilities"
|
||||
name="chroma_filter_test", description="Test filtering capabilities"
|
||||
)
|
||||
package_path = adaptor.package(sample_skill_dir, tmp_path)
|
||||
|
||||
@@ -390,23 +385,18 @@ class TestChromaIntegration:
|
||||
try:
|
||||
collection = client.get_or_create_collection(name=collection_name)
|
||||
collection.add(
|
||||
documents=data["documents"],
|
||||
metadatas=data["metadatas"],
|
||||
ids=data["ids"]
|
||||
documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"]
|
||||
)
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
# Query with category filter
|
||||
results = collection.get(
|
||||
where={"category": "getting started"}
|
||||
)
|
||||
results = collection.get(where={"category": "getting started"})
|
||||
|
||||
# Verify filtering worked
|
||||
assert len(results["documents"]) > 0, "No documents matched filter"
|
||||
for metadata in results["metadatas"]:
|
||||
assert metadata["category"] == "getting started", \
|
||||
"Filter returned wrong category"
|
||||
assert metadata["category"] == "getting started", "Filter returned wrong category"
|
||||
|
||||
finally:
|
||||
with contextlib.suppress(Exception):
|
||||
@@ -428,7 +418,9 @@ class TestQdrantIntegration:
|
||||
|
||||
# Check if Qdrant is running
|
||||
if not check_service_available("http://localhost:6333/"):
|
||||
pytest.skip("Qdrant not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)")
|
||||
pytest.skip(
|
||||
"Qdrant not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)"
|
||||
)
|
||||
|
||||
# Connect to Qdrant
|
||||
try:
|
||||
@@ -440,8 +432,7 @@ class TestQdrantIntegration:
|
||||
# Package skill
|
||||
adaptor = get_adaptor("qdrant")
|
||||
SkillMetadata(
|
||||
name="qdrant_integration_test",
|
||||
description="Integration test skill for Qdrant"
|
||||
name="qdrant_integration_test", description="Integration test skill for Qdrant"
|
||||
)
|
||||
package_path = adaptor.package(sample_skill_dir, tmp_path)
|
||||
|
||||
@@ -465,25 +456,21 @@ class TestQdrantIntegration:
|
||||
# Create collection
|
||||
client.create_collection(
|
||||
collection_name=collection_name,
|
||||
vectors_config=VectorParams(
|
||||
size=vector_size,
|
||||
distance=Distance.COSINE
|
||||
)
|
||||
vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
|
||||
)
|
||||
|
||||
# Upload points (with placeholder vectors for testing)
|
||||
points = []
|
||||
for point in data["points"]:
|
||||
points.append(PointStruct(
|
||||
id=point["id"],
|
||||
vector=[0.0] * vector_size, # Placeholder vectors
|
||||
payload=point["payload"]
|
||||
))
|
||||
points.append(
|
||||
PointStruct(
|
||||
id=point["id"],
|
||||
vector=[0.0] * vector_size, # Placeholder vectors
|
||||
payload=point["payload"],
|
||||
)
|
||||
)
|
||||
|
||||
client.upsert(
|
||||
collection_name=collection_name,
|
||||
points=points
|
||||
)
|
||||
client.upsert(collection_name=collection_name, points=points)
|
||||
|
||||
# Wait for indexing
|
||||
time.sleep(1)
|
||||
@@ -493,14 +480,10 @@ class TestQdrantIntegration:
|
||||
|
||||
# Verify collection
|
||||
assert collection_info.points_count > 0, "No points in collection"
|
||||
assert collection_info.points_count == len(data["points"]), \
|
||||
"Point count mismatch"
|
||||
assert collection_info.points_count == len(data["points"]), "Point count mismatch"
|
||||
|
||||
# Query - Scroll through points
|
||||
scroll_result = client.scroll(
|
||||
collection_name=collection_name,
|
||||
limit=10
|
||||
)
|
||||
scroll_result = client.scroll(collection_name=collection_name, limit=10)
|
||||
|
||||
points_list = scroll_result[0]
|
||||
assert len(points_list) > 0, "No points returned"
|
||||
@@ -514,8 +497,9 @@ class TestQdrantIntegration:
|
||||
|
||||
# Verify content
|
||||
contents = [p.payload["content"] for p in points_list]
|
||||
assert any("vector" in content.lower() for content in contents), \
|
||||
assert any("vector" in content.lower() for content in contents), (
|
||||
"Expected content not found"
|
||||
)
|
||||
|
||||
finally:
|
||||
# Cleanup - Delete collection
|
||||
@@ -527,8 +511,12 @@ class TestQdrantIntegration:
|
||||
try:
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import (
|
||||
Distance, VectorParams, PointStruct,
|
||||
Filter, FieldCondition, MatchValue
|
||||
Distance,
|
||||
VectorParams,
|
||||
PointStruct,
|
||||
Filter,
|
||||
FieldCondition,
|
||||
MatchValue,
|
||||
)
|
||||
except ImportError:
|
||||
pytest.skip("qdrant-client not installed")
|
||||
@@ -544,10 +532,7 @@ class TestQdrantIntegration:
|
||||
|
||||
# Package and upload
|
||||
adaptor = get_adaptor("qdrant")
|
||||
SkillMetadata(
|
||||
name="qdrant_filter_test",
|
||||
description="Test filtering capabilities"
|
||||
)
|
||||
SkillMetadata(name="qdrant_filter_test", description="Test filtering capabilities")
|
||||
package_path = adaptor.package(sample_skill_dir, tmp_path)
|
||||
|
||||
with open(package_path) as f:
|
||||
@@ -560,19 +545,16 @@ class TestQdrantIntegration:
|
||||
# Create and upload
|
||||
client.create_collection(
|
||||
collection_name=collection_name,
|
||||
vectors_config=VectorParams(
|
||||
size=vector_size,
|
||||
distance=Distance.COSINE
|
||||
)
|
||||
vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
|
||||
)
|
||||
|
||||
points = []
|
||||
for point in data["points"]:
|
||||
points.append(PointStruct(
|
||||
id=point["id"],
|
||||
vector=[0.0] * vector_size,
|
||||
payload=point["payload"]
|
||||
))
|
||||
points.append(
|
||||
PointStruct(
|
||||
id=point["id"], vector=[0.0] * vector_size, payload=point["payload"]
|
||||
)
|
||||
)
|
||||
|
||||
client.upsert(collection_name=collection_name, points=points)
|
||||
time.sleep(1)
|
||||
@@ -581,14 +563,9 @@ class TestQdrantIntegration:
|
||||
scroll_result = client.scroll(
|
||||
collection_name=collection_name,
|
||||
scroll_filter=Filter(
|
||||
must=[
|
||||
FieldCondition(
|
||||
key="type",
|
||||
match=MatchValue(value="reference")
|
||||
)
|
||||
]
|
||||
must=[FieldCondition(key="type", match=MatchValue(value="reference"))]
|
||||
),
|
||||
limit=10
|
||||
limit=10,
|
||||
)
|
||||
|
||||
points_list = scroll_result[0]
|
||||
@@ -596,8 +573,7 @@ class TestQdrantIntegration:
|
||||
# Verify filtering worked
|
||||
assert len(points_list) > 0, "No points matched filter"
|
||||
for point in points_list:
|
||||
assert point.payload["type"] == "reference", \
|
||||
"Filter returned wrong type"
|
||||
assert point.payload["type"] == "reference", "Filter returned wrong type"
|
||||
|
||||
finally:
|
||||
with contextlib.suppress(Exception):
|
||||
@@ -607,4 +583,5 @@ class TestQdrantIntegration:
|
||||
if __name__ == "__main__":
|
||||
# Run integration tests
|
||||
import sys
|
||||
|
||||
sys.exit(pytest.main([__file__, "-v", "-m", "integration"]))
|
||||
|
||||
@@ -192,9 +192,7 @@ https://mikro-orm.io/docs/defining-entities#formulas
|
||||
|
||||
# Verify converted URLs are valid
|
||||
# In real scenario, these would be added to pending_urls and scraped
|
||||
self.assertTrue(
|
||||
len(converted_urls) > 0, "Should generate at least one URL to scrape"
|
||||
)
|
||||
self.assertTrue(len(converted_urls) > 0, "Should generate at least one URL to scrape")
|
||||
|
||||
# Verify no URLs would cause 404 (no anchors in middle of path)
|
||||
for url in converted_urls:
|
||||
|
||||
@@ -464,13 +464,15 @@ class TestValidateConfigTool(unittest.IsolatedAsyncioTestCase):
|
||||
valid_config = {
|
||||
"name": "valid-test",
|
||||
"description": "Test configuration",
|
||||
"sources": [{
|
||||
"type": "documentation",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
|
||||
"rate_limit": 0.5,
|
||||
"max_pages": 100,
|
||||
}],
|
||||
"sources": [
|
||||
{
|
||||
"type": "documentation",
|
||||
"base_url": "https://example.com/",
|
||||
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
|
||||
"rate_limit": 0.5,
|
||||
"max_pages": 100,
|
||||
}
|
||||
],
|
||||
}
|
||||
with open(config_path, "w") as f:
|
||||
json.dump(valid_config, f)
|
||||
|
||||
@@ -19,10 +19,7 @@ import json
|
||||
# Add src to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from skill_seekers.cli.multilang_support import (
|
||||
LanguageDetector,
|
||||
MultiLanguageManager
|
||||
)
|
||||
from skill_seekers.cli.multilang_support import LanguageDetector, MultiLanguageManager
|
||||
|
||||
|
||||
def test_detect_english():
|
||||
@@ -32,8 +29,8 @@ def test_detect_english():
|
||||
text = "This is an English document. It contains common English words."
|
||||
lang_info = detector.detect(text)
|
||||
|
||||
assert lang_info.code == 'en'
|
||||
assert lang_info.name == 'English'
|
||||
assert lang_info.code == "en"
|
||||
assert lang_info.name == "English"
|
||||
assert lang_info.confidence > 0.0
|
||||
|
||||
|
||||
@@ -44,8 +41,8 @@ def test_detect_spanish():
|
||||
text = "Este es un documento en español. Contiene palabras comunes en español."
|
||||
lang_info = detector.detect(text)
|
||||
|
||||
assert lang_info.code == 'es'
|
||||
assert lang_info.name == 'Spanish'
|
||||
assert lang_info.code == "es"
|
||||
assert lang_info.name == "Spanish"
|
||||
|
||||
|
||||
def test_detect_french():
|
||||
@@ -55,8 +52,8 @@ def test_detect_french():
|
||||
text = "Ceci est un document en français. Il contient des mots français communs."
|
||||
lang_info = detector.detect(text)
|
||||
|
||||
assert lang_info.code == 'fr'
|
||||
assert lang_info.name == 'French'
|
||||
assert lang_info.code == "fr"
|
||||
assert lang_info.name == "French"
|
||||
|
||||
|
||||
def test_detect_german():
|
||||
@@ -66,8 +63,8 @@ def test_detect_german():
|
||||
text = "Dies ist ein deutsches Dokument. Es enthält übliche deutsche Wörter."
|
||||
lang_info = detector.detect(text)
|
||||
|
||||
assert lang_info.code == 'de'
|
||||
assert lang_info.name == 'German'
|
||||
assert lang_info.code == "de"
|
||||
assert lang_info.name == "German"
|
||||
|
||||
|
||||
def test_detect_chinese():
|
||||
@@ -77,33 +74,33 @@ def test_detect_chinese():
|
||||
text = "这是一个中文文档。它包含常见的中文字符。"
|
||||
lang_info = detector.detect(text)
|
||||
|
||||
assert lang_info.code == 'zh'
|
||||
assert lang_info.name == 'Chinese'
|
||||
assert lang_info.code == "zh"
|
||||
assert lang_info.name == "Chinese"
|
||||
|
||||
|
||||
def test_detect_from_filename_dot_pattern():
|
||||
"""Test language detection from filename (file.en.md pattern)."""
|
||||
detector = LanguageDetector()
|
||||
|
||||
assert detector.detect_from_filename("README.en.md") == 'en'
|
||||
assert detector.detect_from_filename("guide.es.md") == 'es'
|
||||
assert detector.detect_from_filename("doc.fr.md") == 'fr'
|
||||
assert detector.detect_from_filename("README.en.md") == "en"
|
||||
assert detector.detect_from_filename("guide.es.md") == "es"
|
||||
assert detector.detect_from_filename("doc.fr.md") == "fr"
|
||||
|
||||
|
||||
def test_detect_from_filename_underscore_pattern():
|
||||
"""Test language detection from filename (file_en.md pattern)."""
|
||||
detector = LanguageDetector()
|
||||
|
||||
assert detector.detect_from_filename("README_en.md") == 'en'
|
||||
assert detector.detect_from_filename("guide_es.md") == 'es'
|
||||
assert detector.detect_from_filename("README_en.md") == "en"
|
||||
assert detector.detect_from_filename("guide_es.md") == "es"
|
||||
|
||||
|
||||
def test_detect_from_filename_dash_pattern():
|
||||
"""Test language detection from filename (file-en.md pattern)."""
|
||||
detector = LanguageDetector()
|
||||
|
||||
assert detector.detect_from_filename("README-en.md") == 'en'
|
||||
assert detector.detect_from_filename("guide-es.md") == 'es'
|
||||
assert detector.detect_from_filename("README-en.md") == "en"
|
||||
assert detector.detect_from_filename("guide-es.md") == "es"
|
||||
|
||||
|
||||
def test_detect_from_filename_no_match():
|
||||
@@ -118,15 +115,11 @@ def test_add_document_single_language():
|
||||
"""Test adding documents in single language."""
|
||||
manager = MultiLanguageManager()
|
||||
|
||||
manager.add_document(
|
||||
"README.md",
|
||||
"This is an English document.",
|
||||
{"category": "overview"}
|
||||
)
|
||||
manager.add_document("README.md", "This is an English document.", {"category": "overview"})
|
||||
|
||||
assert len(manager.get_languages()) == 1
|
||||
assert 'en' in manager.get_languages()
|
||||
assert manager.get_document_count('en') == 1
|
||||
assert "en" in manager.get_languages()
|
||||
assert manager.get_document_count("en") == 1
|
||||
|
||||
|
||||
def test_add_document_multiple_languages():
|
||||
@@ -138,9 +131,9 @@ def test_add_document_multiple_languages():
|
||||
manager.add_document("README.fr.md", "Ceci est français.", {})
|
||||
|
||||
assert len(manager.get_languages()) == 3
|
||||
assert 'en' in manager.get_languages()
|
||||
assert 'es' in manager.get_languages()
|
||||
assert 'fr' in manager.get_languages()
|
||||
assert "en" in manager.get_languages()
|
||||
assert "es" in manager.get_languages()
|
||||
assert "fr" in manager.get_languages()
|
||||
|
||||
|
||||
def test_force_language():
|
||||
@@ -148,15 +141,10 @@ def test_force_language():
|
||||
manager = MultiLanguageManager()
|
||||
|
||||
# Force Spanish despite English content
|
||||
manager.add_document(
|
||||
"file.md",
|
||||
"This is actually English content.",
|
||||
{},
|
||||
force_language='es'
|
||||
)
|
||||
manager.add_document("file.md", "This is actually English content.", {}, force_language="es")
|
||||
|
||||
assert 'es' in manager.get_languages()
|
||||
assert manager.get_document_count('es') == 1
|
||||
assert "es" in manager.get_languages()
|
||||
assert manager.get_document_count("es") == 1
|
||||
|
||||
|
||||
def test_filename_language_priority():
|
||||
@@ -164,14 +152,10 @@ def test_filename_language_priority():
|
||||
manager = MultiLanguageManager()
|
||||
|
||||
# Filename says Spanish, but content is English
|
||||
manager.add_document(
|
||||
"guide.es.md",
|
||||
"This is English content.",
|
||||
{}
|
||||
)
|
||||
manager.add_document("guide.es.md", "This is English content.", {})
|
||||
|
||||
# Should use filename language
|
||||
assert 'es' in manager.get_languages()
|
||||
assert "es" in manager.get_languages()
|
||||
|
||||
|
||||
def test_document_count_all():
|
||||
@@ -183,8 +167,8 @@ def test_document_count_all():
|
||||
manager.add_document("file3.es.md", "Spanish doc", {})
|
||||
|
||||
assert manager.get_document_count() == 3
|
||||
assert manager.get_document_count('en') == 2
|
||||
assert manager.get_document_count('es') == 1
|
||||
assert manager.get_document_count("en") == 2
|
||||
assert manager.get_document_count("es") == 1
|
||||
|
||||
|
||||
def test_primary_language():
|
||||
@@ -195,7 +179,7 @@ def test_primary_language():
|
||||
manager.add_document("file2.es.md", "Spanish doc", {})
|
||||
|
||||
# Primary should be first added
|
||||
assert manager.primary_language == 'en'
|
||||
assert manager.primary_language == "en"
|
||||
|
||||
|
||||
def test_translation_status():
|
||||
@@ -208,9 +192,9 @@ def test_translation_status():
|
||||
|
||||
status = manager.get_translation_status()
|
||||
|
||||
assert status.source_language == 'en'
|
||||
assert 'es' in status.translated_languages
|
||||
assert 'fr' in status.translated_languages
|
||||
assert status.source_language == "en"
|
||||
assert "es" in status.translated_languages
|
||||
assert "fr" in status.translated_languages
|
||||
assert len(status.translated_languages) == 2
|
||||
|
||||
|
||||
@@ -225,17 +209,17 @@ def test_export_by_language():
|
||||
exports = manager.export_by_language(Path(tmpdir))
|
||||
|
||||
assert len(exports) == 2
|
||||
assert 'en' in exports
|
||||
assert 'es' in exports
|
||||
assert "en" in exports
|
||||
assert "es" in exports
|
||||
|
||||
# Check files exist
|
||||
assert exports['en'].exists()
|
||||
assert exports['es'].exists()
|
||||
assert exports["en"].exists()
|
||||
assert exports["es"].exists()
|
||||
|
||||
# Check content
|
||||
en_data = json.loads(exports['en'].read_text())
|
||||
assert en_data['language'] == 'en'
|
||||
assert en_data['document_count'] == 1
|
||||
en_data = json.loads(exports["en"].read_text())
|
||||
assert en_data["language"] == "en"
|
||||
assert en_data["document_count"] == 1
|
||||
|
||||
|
||||
def test_translation_report_generation():
|
||||
@@ -268,11 +252,11 @@ def test_script_detection():
|
||||
|
||||
# English uses Latin script
|
||||
en_info = detector.detect("This is English")
|
||||
assert en_info.script == 'Latin'
|
||||
assert en_info.script == "Latin"
|
||||
|
||||
# Chinese uses Han script
|
||||
zh_info = detector.detect("这是中文")
|
||||
assert zh_info.script == 'Han'
|
||||
assert zh_info.script == "Han"
|
||||
|
||||
|
||||
def test_confidence_scoring():
|
||||
@@ -283,7 +267,7 @@ def test_confidence_scoring():
|
||||
strong_en = "The quick brown fox jumps over the lazy dog. This is clearly English."
|
||||
lang_info = detector.detect(strong_en)
|
||||
|
||||
assert lang_info.code == 'en'
|
||||
assert lang_info.code == "en"
|
||||
assert lang_info.confidence > 0.3 # Should have decent confidence
|
||||
|
||||
|
||||
@@ -294,9 +278,9 @@ def test_metadata_preservation():
|
||||
metadata = {"category": "guide", "version": "1.0"}
|
||||
manager.add_document("file.md", "English content", metadata)
|
||||
|
||||
docs = manager.documents['en']
|
||||
docs = manager.documents["en"]
|
||||
assert len(docs) == 1
|
||||
assert docs[0]['metadata'] == metadata
|
||||
assert docs[0]["metadata"] == metadata
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -14,9 +14,9 @@ class TestPresetDefinitions:
|
||||
|
||||
def test_all_presets_defined(self):
|
||||
"""Test that all expected presets are defined."""
|
||||
assert 'quick' in PRESETS
|
||||
assert 'standard' in PRESETS
|
||||
assert 'comprehensive' in PRESETS
|
||||
assert "quick" in PRESETS
|
||||
assert "standard" in PRESETS
|
||||
assert "comprehensive" in PRESETS
|
||||
assert len(PRESETS) == 3
|
||||
|
||||
def test_preset_structure(self):
|
||||
@@ -25,7 +25,7 @@ class TestPresetDefinitions:
|
||||
assert isinstance(preset, AnalysisPreset)
|
||||
assert preset.name
|
||||
assert preset.description
|
||||
assert preset.depth in ['surface', 'deep', 'full']
|
||||
assert preset.depth in ["surface", "deep", "full"]
|
||||
assert isinstance(preset.features, dict)
|
||||
assert 0 <= preset.enhance_level <= 3
|
||||
assert preset.estimated_time
|
||||
@@ -33,45 +33,45 @@ class TestPresetDefinitions:
|
||||
|
||||
def test_quick_preset(self):
|
||||
"""Test quick preset configuration."""
|
||||
quick = PRESETS['quick']
|
||||
assert quick.name == 'Quick'
|
||||
assert quick.depth == 'surface'
|
||||
quick = PRESETS["quick"]
|
||||
assert quick.name == "Quick"
|
||||
assert quick.depth == "surface"
|
||||
assert quick.enhance_level == 0
|
||||
assert quick.estimated_time == '1-2 minutes'
|
||||
assert quick.icon == '⚡'
|
||||
assert quick.estimated_time == "1-2 minutes"
|
||||
assert quick.icon == "⚡"
|
||||
# Quick should disable slow features
|
||||
assert quick.features['api_reference'] # Essential
|
||||
assert not quick.features['dependency_graph'] # Slow
|
||||
assert not quick.features['patterns'] # Slow
|
||||
assert not quick.features['test_examples'] # Slow
|
||||
assert not quick.features['how_to_guides'] # Requires AI
|
||||
assert quick.features['docs'] # Essential
|
||||
assert quick.features["api_reference"] # Essential
|
||||
assert not quick.features["dependency_graph"] # Slow
|
||||
assert not quick.features["patterns"] # Slow
|
||||
assert not quick.features["test_examples"] # Slow
|
||||
assert not quick.features["how_to_guides"] # Requires AI
|
||||
assert quick.features["docs"] # Essential
|
||||
|
||||
def test_standard_preset(self):
|
||||
"""Test standard preset configuration."""
|
||||
standard = PRESETS['standard']
|
||||
assert standard.name == 'Standard'
|
||||
assert standard.depth == 'deep'
|
||||
standard = PRESETS["standard"]
|
||||
assert standard.name == "Standard"
|
||||
assert standard.depth == "deep"
|
||||
assert standard.enhance_level == 1
|
||||
assert standard.estimated_time == '5-10 minutes'
|
||||
assert standard.icon == '🎯'
|
||||
assert standard.estimated_time == "5-10 minutes"
|
||||
assert standard.icon == "🎯"
|
||||
# Standard should enable core features
|
||||
assert standard.features['api_reference']
|
||||
assert standard.features['dependency_graph']
|
||||
assert standard.features['patterns']
|
||||
assert standard.features['test_examples']
|
||||
assert not standard.features['how_to_guides'] # Slow
|
||||
assert standard.features['config_patterns']
|
||||
assert standard.features['docs']
|
||||
assert standard.features["api_reference"]
|
||||
assert standard.features["dependency_graph"]
|
||||
assert standard.features["patterns"]
|
||||
assert standard.features["test_examples"]
|
||||
assert not standard.features["how_to_guides"] # Slow
|
||||
assert standard.features["config_patterns"]
|
||||
assert standard.features["docs"]
|
||||
|
||||
def test_comprehensive_preset(self):
|
||||
"""Test comprehensive preset configuration."""
|
||||
comprehensive = PRESETS['comprehensive']
|
||||
assert comprehensive.name == 'Comprehensive'
|
||||
assert comprehensive.depth == 'full'
|
||||
comprehensive = PRESETS["comprehensive"]
|
||||
assert comprehensive.name == "Comprehensive"
|
||||
assert comprehensive.depth == "full"
|
||||
assert comprehensive.enhance_level == 3
|
||||
assert comprehensive.estimated_time == '20-60 minutes'
|
||||
assert comprehensive.icon == '🚀'
|
||||
assert comprehensive.estimated_time == "20-60 minutes"
|
||||
assert comprehensive.icon == "🚀"
|
||||
# Comprehensive should enable ALL features
|
||||
assert all(comprehensive.features.values())
|
||||
|
||||
@@ -81,44 +81,44 @@ class TestPresetManager:
|
||||
|
||||
def test_get_preset(self):
|
||||
"""Test PresetManager.get_preset()."""
|
||||
quick = PresetManager.get_preset('quick')
|
||||
quick = PresetManager.get_preset("quick")
|
||||
assert quick is not None
|
||||
assert quick.name == 'Quick'
|
||||
assert quick.depth == 'surface'
|
||||
assert quick.name == "Quick"
|
||||
assert quick.depth == "surface"
|
||||
|
||||
# Case insensitive
|
||||
standard = PresetManager.get_preset('STANDARD')
|
||||
standard = PresetManager.get_preset("STANDARD")
|
||||
assert standard is not None
|
||||
assert standard.name == 'Standard'
|
||||
assert standard.name == "Standard"
|
||||
|
||||
def test_get_preset_invalid(self):
|
||||
"""Test PresetManager.get_preset() with invalid name."""
|
||||
invalid = PresetManager.get_preset('nonexistent')
|
||||
invalid = PresetManager.get_preset("nonexistent")
|
||||
assert invalid is None
|
||||
|
||||
def test_list_presets(self):
|
||||
"""Test PresetManager.list_presets()."""
|
||||
presets = PresetManager.list_presets()
|
||||
assert len(presets) == 3
|
||||
assert 'quick' in presets
|
||||
assert 'standard' in presets
|
||||
assert 'comprehensive' in presets
|
||||
assert "quick" in presets
|
||||
assert "standard" in presets
|
||||
assert "comprehensive" in presets
|
||||
|
||||
def test_format_preset_help(self):
|
||||
"""Test PresetManager.format_preset_help()."""
|
||||
help_text = PresetManager.format_preset_help()
|
||||
assert 'Available presets:' in help_text
|
||||
assert '⚡ quick' in help_text
|
||||
assert '🎯 standard' in help_text
|
||||
assert '🚀 comprehensive' in help_text
|
||||
assert '1-2 minutes' in help_text
|
||||
assert '5-10 minutes' in help_text
|
||||
assert '20-60 minutes' in help_text
|
||||
assert "Available presets:" in help_text
|
||||
assert "⚡ quick" in help_text
|
||||
assert "🎯 standard" in help_text
|
||||
assert "🚀 comprehensive" in help_text
|
||||
assert "1-2 minutes" in help_text
|
||||
assert "5-10 minutes" in help_text
|
||||
assert "20-60 minutes" in help_text
|
||||
|
||||
def test_get_default_preset(self):
|
||||
"""Test PresetManager.get_default_preset()."""
|
||||
default = PresetManager.get_default_preset()
|
||||
assert default == 'standard'
|
||||
assert default == "standard"
|
||||
|
||||
|
||||
class TestPresetApplication:
|
||||
@@ -126,85 +126,85 @@ class TestPresetApplication:
|
||||
|
||||
def test_apply_preset_quick(self):
|
||||
"""Test applying quick preset."""
|
||||
args = {'directory': '/tmp/test'}
|
||||
updated = PresetManager.apply_preset('quick', args)
|
||||
args = {"directory": "/tmp/test"}
|
||||
updated = PresetManager.apply_preset("quick", args)
|
||||
|
||||
assert updated['depth'] == 'surface'
|
||||
assert updated['enhance_level'] == 0
|
||||
assert updated['skip_patterns'] # Quick disables patterns
|
||||
assert updated['skip_dependency_graph'] # Quick disables dep graph
|
||||
assert updated['skip_test_examples'] # Quick disables tests
|
||||
assert updated['skip_how_to_guides'] # Quick disables guides
|
||||
assert not updated['skip_api_reference'] # Quick enables API ref
|
||||
assert not updated['skip_docs'] # Quick enables docs
|
||||
assert updated["depth"] == "surface"
|
||||
assert updated["enhance_level"] == 0
|
||||
assert updated["skip_patterns"] # Quick disables patterns
|
||||
assert updated["skip_dependency_graph"] # Quick disables dep graph
|
||||
assert updated["skip_test_examples"] # Quick disables tests
|
||||
assert updated["skip_how_to_guides"] # Quick disables guides
|
||||
assert not updated["skip_api_reference"] # Quick enables API ref
|
||||
assert not updated["skip_docs"] # Quick enables docs
|
||||
|
||||
def test_apply_preset_standard(self):
|
||||
"""Test applying standard preset."""
|
||||
args = {'directory': '/tmp/test'}
|
||||
updated = PresetManager.apply_preset('standard', args)
|
||||
args = {"directory": "/tmp/test"}
|
||||
updated = PresetManager.apply_preset("standard", args)
|
||||
|
||||
assert updated['depth'] == 'deep'
|
||||
assert updated['enhance_level'] == 1
|
||||
assert not updated['skip_patterns'] # Standard enables patterns
|
||||
assert not updated['skip_dependency_graph'] # Standard enables dep graph
|
||||
assert not updated['skip_test_examples'] # Standard enables tests
|
||||
assert updated['skip_how_to_guides'] # Standard disables guides (slow)
|
||||
assert not updated['skip_api_reference'] # Standard enables API ref
|
||||
assert not updated['skip_docs'] # Standard enables docs
|
||||
assert updated["depth"] == "deep"
|
||||
assert updated["enhance_level"] == 1
|
||||
assert not updated["skip_patterns"] # Standard enables patterns
|
||||
assert not updated["skip_dependency_graph"] # Standard enables dep graph
|
||||
assert not updated["skip_test_examples"] # Standard enables tests
|
||||
assert updated["skip_how_to_guides"] # Standard disables guides (slow)
|
||||
assert not updated["skip_api_reference"] # Standard enables API ref
|
||||
assert not updated["skip_docs"] # Standard enables docs
|
||||
|
||||
def test_apply_preset_comprehensive(self):
|
||||
"""Test applying comprehensive preset."""
|
||||
args = {'directory': '/tmp/test'}
|
||||
updated = PresetManager.apply_preset('comprehensive', args)
|
||||
args = {"directory": "/tmp/test"}
|
||||
updated = PresetManager.apply_preset("comprehensive", args)
|
||||
|
||||
assert updated['depth'] == 'full'
|
||||
assert updated['enhance_level'] == 3
|
||||
assert updated["depth"] == "full"
|
||||
assert updated["enhance_level"] == 3
|
||||
# Comprehensive enables ALL features
|
||||
assert not updated['skip_patterns']
|
||||
assert not updated['skip_dependency_graph']
|
||||
assert not updated['skip_test_examples']
|
||||
assert not updated['skip_how_to_guides']
|
||||
assert not updated['skip_api_reference']
|
||||
assert not updated['skip_config_patterns']
|
||||
assert not updated['skip_docs']
|
||||
assert not updated["skip_patterns"]
|
||||
assert not updated["skip_dependency_graph"]
|
||||
assert not updated["skip_test_examples"]
|
||||
assert not updated["skip_how_to_guides"]
|
||||
assert not updated["skip_api_reference"]
|
||||
assert not updated["skip_config_patterns"]
|
||||
assert not updated["skip_docs"]
|
||||
|
||||
def test_cli_overrides_preset(self):
|
||||
"""Test that CLI args override preset defaults."""
|
||||
args = {
|
||||
'directory': '/tmp/test',
|
||||
'enhance_level': 2, # Override preset default
|
||||
'skip_patterns': False # Override preset default
|
||||
"directory": "/tmp/test",
|
||||
"enhance_level": 2, # Override preset default
|
||||
"skip_patterns": False, # Override preset default
|
||||
}
|
||||
|
||||
updated = PresetManager.apply_preset('quick', args)
|
||||
updated = PresetManager.apply_preset("quick", args)
|
||||
|
||||
# Preset says enhance_level=0, but CLI said 2
|
||||
assert updated['enhance_level'] == 2 # CLI wins
|
||||
assert updated["enhance_level"] == 2 # CLI wins
|
||||
|
||||
# Preset says skip_patterns=True (disabled), but CLI said False (enabled)
|
||||
assert not updated['skip_patterns'] # CLI wins
|
||||
assert not updated["skip_patterns"] # CLI wins
|
||||
|
||||
def test_apply_preset_preserves_args(self):
|
||||
"""Test that apply_preset preserves existing args."""
|
||||
args = {
|
||||
'directory': '/tmp/test',
|
||||
'output': 'custom_output/',
|
||||
'languages': 'Python,JavaScript'
|
||||
"directory": "/tmp/test",
|
||||
"output": "custom_output/",
|
||||
"languages": "Python,JavaScript",
|
||||
}
|
||||
|
||||
updated = PresetManager.apply_preset('standard', args)
|
||||
updated = PresetManager.apply_preset("standard", args)
|
||||
|
||||
# Existing args should be preserved
|
||||
assert updated['directory'] == '/tmp/test'
|
||||
assert updated['output'] == 'custom_output/'
|
||||
assert updated['languages'] == 'Python,JavaScript'
|
||||
assert updated["directory"] == "/tmp/test"
|
||||
assert updated["output"] == "custom_output/"
|
||||
assert updated["languages"] == "Python,JavaScript"
|
||||
|
||||
def test_apply_preset_invalid(self):
|
||||
"""Test applying invalid preset raises error."""
|
||||
args = {'directory': '/tmp/test'}
|
||||
args = {"directory": "/tmp/test"}
|
||||
|
||||
with pytest.raises(ValueError, match="Unknown preset: nonexistent"):
|
||||
PresetManager.apply_preset('nonexistent', args)
|
||||
PresetManager.apply_preset("nonexistent", args)
|
||||
|
||||
|
||||
class TestDeprecationWarnings:
|
||||
@@ -215,12 +215,7 @@ class TestDeprecationWarnings:
|
||||
from skill_seekers.cli.codebase_scraper import _check_deprecated_flags
|
||||
import argparse
|
||||
|
||||
args = argparse.Namespace(
|
||||
quick=True,
|
||||
comprehensive=False,
|
||||
depth=None,
|
||||
ai_mode='auto'
|
||||
)
|
||||
args = argparse.Namespace(quick=True, comprehensive=False, depth=None, ai_mode="auto")
|
||||
|
||||
_check_deprecated_flags(args)
|
||||
|
||||
@@ -235,12 +230,7 @@ class TestDeprecationWarnings:
|
||||
from skill_seekers.cli.codebase_scraper import _check_deprecated_flags
|
||||
import argparse
|
||||
|
||||
args = argparse.Namespace(
|
||||
quick=False,
|
||||
comprehensive=True,
|
||||
depth=None,
|
||||
ai_mode='auto'
|
||||
)
|
||||
args = argparse.Namespace(quick=False, comprehensive=True, depth=None, ai_mode="auto")
|
||||
|
||||
_check_deprecated_flags(args)
|
||||
|
||||
@@ -255,12 +245,7 @@ class TestDeprecationWarnings:
|
||||
from skill_seekers.cli.codebase_scraper import _check_deprecated_flags
|
||||
import argparse
|
||||
|
||||
args = argparse.Namespace(
|
||||
quick=False,
|
||||
comprehensive=False,
|
||||
depth='full',
|
||||
ai_mode='auto'
|
||||
)
|
||||
args = argparse.Namespace(quick=False, comprehensive=False, depth="full", ai_mode="auto")
|
||||
|
||||
_check_deprecated_flags(args)
|
||||
|
||||
@@ -275,12 +260,7 @@ class TestDeprecationWarnings:
|
||||
from skill_seekers.cli.codebase_scraper import _check_deprecated_flags
|
||||
import argparse
|
||||
|
||||
args = argparse.Namespace(
|
||||
quick=False,
|
||||
comprehensive=False,
|
||||
depth=None,
|
||||
ai_mode='api'
|
||||
)
|
||||
args = argparse.Namespace(quick=False, comprehensive=False, depth=None, ai_mode="api")
|
||||
|
||||
_check_deprecated_flags(args)
|
||||
|
||||
@@ -295,12 +275,7 @@ class TestDeprecationWarnings:
|
||||
from skill_seekers.cli.codebase_scraper import _check_deprecated_flags
|
||||
import argparse
|
||||
|
||||
args = argparse.Namespace(
|
||||
quick=True,
|
||||
comprehensive=False,
|
||||
depth='surface',
|
||||
ai_mode='local'
|
||||
)
|
||||
args = argparse.Namespace(quick=True, comprehensive=False, depth="surface", ai_mode="local")
|
||||
|
||||
_check_deprecated_flags(args)
|
||||
|
||||
@@ -317,12 +292,7 @@ class TestDeprecationWarnings:
|
||||
from skill_seekers.cli.codebase_scraper import _check_deprecated_flags
|
||||
import argparse
|
||||
|
||||
args = argparse.Namespace(
|
||||
quick=False,
|
||||
comprehensive=False,
|
||||
depth=None,
|
||||
ai_mode='auto'
|
||||
)
|
||||
args = argparse.Namespace(quick=False, comprehensive=False, depth=None, ai_mode="auto")
|
||||
|
||||
_check_deprecated_flags(args)
|
||||
|
||||
@@ -337,31 +307,31 @@ class TestBackwardCompatibility:
|
||||
def test_old_flags_still_work(self):
|
||||
"""Test that old flags still work (with warnings)."""
|
||||
# --quick flag
|
||||
args = {'quick': True}
|
||||
updated = PresetManager.apply_preset('quick', args)
|
||||
assert updated['depth'] == 'surface'
|
||||
args = {"quick": True}
|
||||
updated = PresetManager.apply_preset("quick", args)
|
||||
assert updated["depth"] == "surface"
|
||||
|
||||
# --comprehensive flag
|
||||
args = {'comprehensive': True}
|
||||
updated = PresetManager.apply_preset('comprehensive', args)
|
||||
assert updated['depth'] == 'full'
|
||||
args = {"comprehensive": True}
|
||||
updated = PresetManager.apply_preset("comprehensive", args)
|
||||
assert updated["depth"] == "full"
|
||||
|
||||
def test_preset_flag_preferred(self):
|
||||
"""Test that --preset flag is the recommended way."""
|
||||
# Using --preset quick
|
||||
args = {'preset': 'quick'}
|
||||
updated = PresetManager.apply_preset('quick', args)
|
||||
assert updated['depth'] == 'surface'
|
||||
args = {"preset": "quick"}
|
||||
updated = PresetManager.apply_preset("quick", args)
|
||||
assert updated["depth"] == "surface"
|
||||
|
||||
# Using --preset standard
|
||||
args = {'preset': 'standard'}
|
||||
updated = PresetManager.apply_preset('standard', args)
|
||||
assert updated['depth'] == 'deep'
|
||||
args = {"preset": "standard"}
|
||||
updated = PresetManager.apply_preset("standard", args)
|
||||
assert updated["depth"] == "deep"
|
||||
|
||||
# Using --preset comprehensive
|
||||
args = {'preset': 'comprehensive'}
|
||||
updated = PresetManager.apply_preset('comprehensive', args)
|
||||
assert updated['depth'] == 'full'
|
||||
args = {"preset": "comprehensive"}
|
||||
updated = PresetManager.apply_preset("comprehensive", args)
|
||||
assert updated["depth"] == "full"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -19,10 +19,7 @@ import tempfile
|
||||
# Add src to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from skill_seekers.cli.quality_metrics import (
|
||||
QualityAnalyzer,
|
||||
MetricLevel
|
||||
)
|
||||
from skill_seekers.cli.quality_metrics import QualityAnalyzer, MetricLevel
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -176,9 +173,9 @@ def test_calculate_statistics(complete_skill_dir):
|
||||
analyzer = QualityAnalyzer(complete_skill_dir)
|
||||
stats = analyzer.calculate_statistics()
|
||||
|
||||
assert stats['total_files'] > 0
|
||||
assert stats['markdown_files'] > 0
|
||||
assert stats['total_words'] > 0
|
||||
assert stats["total_files"] > 0
|
||||
assert stats["markdown_files"] > 0
|
||||
assert stats["total_words"] > 0
|
||||
|
||||
|
||||
def test_overall_score_calculation():
|
||||
@@ -197,9 +194,7 @@ def test_overall_score_calculation():
|
||||
coverage = 70.0
|
||||
health = 85.0
|
||||
|
||||
overall = analyzer.calculate_overall_score(
|
||||
completeness, accuracy, coverage, health
|
||||
)
|
||||
overall = analyzer.calculate_overall_score(completeness, accuracy, coverage, health)
|
||||
|
||||
assert overall.completeness == 80.0
|
||||
assert overall.accuracy == 90.0
|
||||
@@ -218,13 +213,13 @@ def test_grade_assignment():
|
||||
|
||||
# Test various scores
|
||||
score_95 = analyzer.calculate_overall_score(95, 95, 95, 95)
|
||||
assert score_95.grade == 'A+'
|
||||
assert score_95.grade == "A+"
|
||||
|
||||
score_85 = analyzer.calculate_overall_score(85, 85, 85, 85)
|
||||
assert score_85.grade in ['A-', 'B+']
|
||||
assert score_85.grade in ["A-", "B+"]
|
||||
|
||||
score_70 = analyzer.calculate_overall_score(70, 70, 70, 70)
|
||||
assert score_70.grade in ['B-', 'C+', 'C']
|
||||
assert score_70.grade in ["B-", "C+", "C"]
|
||||
|
||||
|
||||
def test_generate_recommendations():
|
||||
@@ -240,7 +235,7 @@ def test_generate_recommendations():
|
||||
recommendations = analyzer.generate_recommendations(score)
|
||||
|
||||
assert len(recommendations) > 0
|
||||
assert any('completeness' in r.lower() for r in recommendations)
|
||||
assert any("completeness" in r.lower() for r in recommendations)
|
||||
|
||||
|
||||
def test_generate_report(complete_skill_dir):
|
||||
|
||||
@@ -28,7 +28,7 @@ class TestRAGChunker:
|
||||
chunk_overlap=100,
|
||||
preserve_code_blocks=False,
|
||||
preserve_paragraphs=False,
|
||||
min_chunk_size=50
|
||||
min_chunk_size=50,
|
||||
)
|
||||
|
||||
assert chunker.chunk_size == 1024
|
||||
@@ -180,13 +180,17 @@ class TestRAGChunker:
|
||||
|
||||
# Create SKILL.md
|
||||
skill_md = skill_dir / "SKILL.md"
|
||||
skill_md.write_text("# Main Skill\n\nThis is the main skill content.\n\nWith multiple paragraphs.")
|
||||
skill_md.write_text(
|
||||
"# Main Skill\n\nThis is the main skill content.\n\nWith multiple paragraphs."
|
||||
)
|
||||
|
||||
# Create references directory with files
|
||||
references_dir = skill_dir / "references"
|
||||
references_dir.mkdir()
|
||||
|
||||
(references_dir / "getting_started.md").write_text("# Getting Started\n\nQuick start guide.")
|
||||
(references_dir / "getting_started.md").write_text(
|
||||
"# Getting Started\n\nQuick start guide."
|
||||
)
|
||||
(references_dir / "api.md").write_text("# API Reference\n\nAPI documentation.")
|
||||
|
||||
# Chunk skill
|
||||
@@ -209,7 +213,7 @@ class TestRAGChunker:
|
||||
{
|
||||
"chunk_id": "test_0",
|
||||
"page_content": "Test content",
|
||||
"metadata": {"source": "test", "chunk_index": 0}
|
||||
"metadata": {"source": "test", "chunk_index": 0},
|
||||
}
|
||||
]
|
||||
|
||||
@@ -340,7 +344,7 @@ class TestRAGChunker:
|
||||
metadata = {
|
||||
"source": "react-docs",
|
||||
"category": "hooks",
|
||||
"url": "https://react.dev/reference/react"
|
||||
"url": "https://react.dev/reference/react",
|
||||
}
|
||||
|
||||
chunks = chunker.chunk_document(text, metadata)
|
||||
@@ -379,10 +383,7 @@ class TestRAGChunkerIntegration:
|
||||
|
||||
# Convert to LangChain Documents
|
||||
docs = [
|
||||
Document(
|
||||
page_content=chunk["page_content"],
|
||||
metadata=chunk["metadata"]
|
||||
)
|
||||
Document(page_content=chunk["page_content"], metadata=chunk["metadata"])
|
||||
for chunk in chunks
|
||||
]
|
||||
|
||||
@@ -407,11 +408,7 @@ class TestRAGChunkerIntegration:
|
||||
|
||||
# Convert to LlamaIndex TextNodes
|
||||
nodes = [
|
||||
TextNode(
|
||||
text=chunk["page_content"],
|
||||
metadata=chunk["metadata"],
|
||||
id_=chunk["chunk_id"]
|
||||
)
|
||||
TextNode(text=chunk["page_content"], metadata=chunk["metadata"], id_=chunk["chunk_id"])
|
||||
for chunk in chunks
|
||||
]
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ pytest.importorskip("mcp.server")
|
||||
# Check if starlette is available
|
||||
try:
|
||||
from starlette.testclient import TestClient
|
||||
|
||||
STARLETTE_AVAILABLE = True
|
||||
except ImportError:
|
||||
STARLETTE_AVAILABLE = False
|
||||
@@ -21,8 +22,7 @@ from skill_seekers.mcp.server_fastmcp import mcp
|
||||
|
||||
# Skip all tests if starlette is not installed
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not STARLETTE_AVAILABLE,
|
||||
reason="starlette not installed (pip install starlette httpx)"
|
||||
not STARLETTE_AVAILABLE, reason="starlette not installed (pip install starlette httpx)"
|
||||
)
|
||||
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user