style: Format all Python files with ruff

- Formatted 103 files to comply with ruff format requirements
- No code logic changes, only formatting/whitespace
- Fixes CI formatting check failures (see the sketch below)
author yusyus
date 2026-02-08 14:42:27 +03:00
parent 6e4f623b9d
commit 0265de5816
103 changed files with 2241 additions and 2627 deletions
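
For context on the hunks that follow: ruff's formatter is Black-compatible, so every change below is mechanical. It rewrites string quotes to double quotes, adds a trailing comma when a call or literal stays multi-line (the "magic trailing comma"), collapses calls that fit within the line length onto one line, and puts spaces around slice colons when the bounds are expressions. CI typically enforces this with ruff format --check (which fails if any file would be rewritten), while ruff format applies the fixes locally. A minimal sketch of these conventions, using hypothetical code rather than lines from this repository:

# Hypothetical example (not from this repo) illustrating ruff format's conventions.

items = list(range(10))
i, batch_size = 0, 4

# Quotes: single-quoted strings are rewritten to double quotes.
metric = {"name": "pages_per_sec", "unit": "pages/sec"}

# Slices: operands that are expressions get spaces around the colon.
batch = items[i : i + batch_size]


def describe(name: str, value: float, unit: str) -> str:
    return f"{name}: {value} {unit}"


# A call that fits on one line is collapsed onto one line...
summary = describe(name="throughput", value=12.5, unit="pages/sec")

# ...while one that stays multi-line gets a trailing comma after the last
# argument, and the magic trailing comma keeps it exploded on future runs.
detail = describe(
    name="a deliberately long name that keeps this call spread over several lines",
    value=1.0,
    unit="ops/sec",
)

print(metric, batch, summary, detail, sep="\n")

Because the formatter is deterministic, re-running it over an already-formatted tree changes nothing, which is exactly what the CI check verifies.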

View File

@@ -33,9 +33,9 @@ from .runner import BenchmarkRunner
from .models import BenchmarkReport, Metric
__all__ = [
'Benchmark',
'BenchmarkResult',
'BenchmarkRunner',
'BenchmarkReport',
'Metric',
"Benchmark",
"BenchmarkResult",
"BenchmarkRunner",
"BenchmarkReport",
"Metric",
]

View File

@@ -11,12 +11,7 @@ from typing import Any
from collections.abc import Callable
from pathlib import Path
from .models import (
Metric,
TimingResult,
MemoryUsage,
BenchmarkReport
)
from .models import Metric, TimingResult, MemoryUsage, BenchmarkReport
class BenchmarkResult:
@@ -97,7 +92,7 @@ class BenchmarkResult:
memory=self.memory,
metrics=self.metrics,
system_info=self.system_info,
recommendations=self.recommendations
recommendations=self.recommendations,
)
@@ -161,7 +156,7 @@ class Benchmark:
operation=operation,
duration=duration,
iterations=iterations,
avg_duration=duration / iterations if iterations > 1 else duration
avg_duration=duration / iterations if iterations > 1 else duration,
)
self.result.add_timing(timing)
@@ -201,7 +196,7 @@ class Benchmark:
before_mb=mem_before,
after_mb=mem_after,
peak_mb=peak_memory,
allocated_mb=mem_after - mem_before
allocated_mb=mem_after - mem_before,
)
self.result.add_memory(usage)
@@ -212,7 +207,7 @@ class Benchmark:
*args,
operation: str | None = None,
track_memory: bool = False,
**kwargs
**kwargs,
) -> Any:
"""
Measure function execution.
@@ -260,17 +255,16 @@ class Benchmark:
def load_config(path):
return json.load(open(path))
"""
def decorator(func: Callable) -> Callable:
@functools.wraps(func)
def wrapper(*args, **kwargs):
return self.measure(
func,
*args,
operation=operation,
track_memory=track_memory,
**kwargs
func, *args, operation=operation, track_memory=track_memory, **kwargs
)
return wrapper
return decorator
def metric(self, name: str, value: float, unit: str):
@@ -285,11 +279,7 @@ class Benchmark:
Examples:
benchmark.metric("pages_per_sec", 12.5, "pages/sec")
"""
metric = Metric(
name=name,
value=value,
unit=unit
)
metric = Metric(name=name, value=value, unit=unit)
self.result.add_metric(metric)
def recommend(self, text: str):
@@ -328,7 +318,7 @@ class Benchmark:
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, 'w') as f:
with open(path, "w") as f:
f.write(report.model_dump_json(indent=2))
def analyze(self):
@@ -339,11 +329,7 @@ class Benchmark:
"""
# Analyze timing bottlenecks
if self.result.timings:
sorted_timings = sorted(
self.result.timings,
key=lambda t: t.duration,
reverse=True
)
sorted_timings = sorted(self.result.timings, key=lambda t: t.duration, reverse=True)
slowest = sorted_timings[0]
total_time = sum(t.duration for t in self.result.timings)
@@ -351,7 +337,7 @@ class Benchmark:
if slowest.duration > total_time * 0.5:
self.recommend(
f"Bottleneck: '{slowest.operation}' takes "
f"{slowest.duration:.1f}s ({slowest.duration/total_time*100:.0f}% of total)"
f"{slowest.duration:.1f}s ({slowest.duration / total_time * 100:.0f}% of total)"
)
# Analyze memory usage
@@ -360,8 +346,7 @@ class Benchmark:
if peak > 1000: # >1GB
self.recommend(
f"High memory usage: {peak:.0f}MB peak. "
"Consider processing in batches."
f"High memory usage: {peak:.0f}MB peak. Consider processing in batches."
)
# Check for memory leaks

View File

@@ -14,8 +14,7 @@ class Metric(BaseModel):
value: float = Field(..., description="Metric value")
unit: str = Field(..., description="Unit (seconds, bytes, pages/sec, etc.)")
timestamp: datetime = Field(
default_factory=datetime.utcnow,
description="When metric was recorded"
default_factory=datetime.utcnow, description="When metric was recorded"
)
@@ -48,26 +47,13 @@ class BenchmarkReport(BaseModel):
finished_at: datetime = Field(..., description="Finish time")
total_duration: float = Field(..., description="Total duration in seconds")
timings: list[TimingResult] = Field(
default_factory=list,
description="Timing results"
)
memory: list[MemoryUsage] = Field(
default_factory=list,
description="Memory usage results"
)
metrics: list[Metric] = Field(
default_factory=list,
description="Additional metrics"
)
timings: list[TimingResult] = Field(default_factory=list, description="Timing results")
memory: list[MemoryUsage] = Field(default_factory=list, description="Memory usage results")
metrics: list[Metric] = Field(default_factory=list, description="Additional metrics")
system_info: dict[str, Any] = Field(
default_factory=dict,
description="System information"
)
system_info: dict[str, Any] = Field(default_factory=dict, description="System information")
recommendations: list[str] = Field(
default_factory=list,
description="Optimization recommendations"
default_factory=list, description="Optimization recommendations"
)
@property
@@ -89,14 +75,8 @@ class ComparisonReport(BaseModel):
baseline: BenchmarkReport = Field(..., description="Baseline benchmark")
current: BenchmarkReport = Field(..., description="Current benchmark")
improvements: list[str] = Field(
default_factory=list,
description="Performance improvements"
)
regressions: list[str] = Field(
default_factory=list,
description="Performance regressions"
)
improvements: list[str] = Field(default_factory=list, description="Performance improvements")
regressions: list[str] = Field(default_factory=list, description="Performance regressions")
speedup_factor: float = Field(..., description="Overall speedup factor")
memory_change_mb: float = Field(..., description="Memory usage change (MB)")

View File

@@ -46,10 +46,7 @@ class BenchmarkRunner:
self.output_dir.mkdir(parents=True, exist_ok=True)
def run(
self,
name: str,
benchmark_func: Callable[[Benchmark], None],
save: bool = True
self, name: str, benchmark_func: Callable[[Benchmark], None], save: bool = True
) -> BenchmarkReport:
"""
Run single benchmark.
@@ -83,7 +80,7 @@ class BenchmarkRunner:
filename = f"{name}_{timestamp}.json"
path = self.output_dir / filename
with open(path, 'w') as f:
with open(path, "w") as f:
f.write(report.model_dump_json(indent=2))
print(f"📊 Saved benchmark: {path}")
@@ -91,9 +88,7 @@ class BenchmarkRunner:
return report
def run_suite(
self,
benchmarks: dict[str, Callable[[Benchmark], None]],
save: bool = True
self, benchmarks: dict[str, Callable[[Benchmark], None]], save: bool = True
) -> dict[str, BenchmarkReport]:
"""
Run multiple benchmarks.
@@ -122,11 +117,7 @@ class BenchmarkRunner:
return reports
def compare(
self,
baseline_path: Path,
current_path: Path
) -> ComparisonReport:
def compare(self, baseline_path: Path, current_path: Path) -> ComparisonReport:
"""
Compare two benchmark reports.
@@ -215,7 +206,7 @@ class BenchmarkRunner:
improvements=improvements,
regressions=regressions,
speedup_factor=speedup_factor,
memory_change_mb=memory_change_mb
memory_change_mb=memory_change_mb,
)
def list_benchmarks(self) -> list[dict[str, Any]]:
@@ -237,13 +228,15 @@ class BenchmarkRunner:
with open(path) as f:
data = json.load(f)
benchmarks.append({
"name": data["name"],
"path": str(path),
"started_at": data["started_at"],
"duration": data["total_duration"],
"operations": len(data.get("timings", []))
})
benchmarks.append(
{
"name": data["name"],
"path": str(path),
"started_at": data["started_at"],
"duration": data["total_duration"],
"operations": len(data.get("timings", [])),
}
)
except Exception:
# Skip invalid files
continue

View File

@@ -74,7 +74,7 @@ class SkillAdaptor(ABC):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill for platform (ZIP, tar.gz, etc.).
@@ -282,7 +282,7 @@ class SkillAdaptor(ABC):
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True,
source_file: str = None
source_file: str = None,
) -> list[tuple[str, dict]]:
"""
Optionally chunk content for RAG platforms.
@@ -326,33 +326,31 @@ class SkillAdaptor(ABC):
chunk_overlap=max(50, chunk_max_tokens // 10), # 10% overlap
preserve_code_blocks=preserve_code_blocks,
preserve_paragraphs=True,
min_chunk_size=100 # 100 tokens minimum
min_chunk_size=100, # 100 tokens minimum
)
# Chunk the document
chunks = chunker.chunk_document(
text=content,
metadata=metadata,
source_file=source_file or metadata.get('file', 'unknown')
source_file=source_file or metadata.get("file", "unknown"),
)
# Convert RAGChunker output format to (text, metadata) tuples
result = []
for chunk_dict in chunks:
chunk_text = chunk_dict['page_content']
chunk_text = chunk_dict["page_content"]
chunk_meta = {
**metadata, # Base metadata
**chunk_dict['metadata'], # RAGChunker metadata (chunk_index, etc.)
'is_chunked': True,
'chunk_id': chunk_dict['chunk_id']
**chunk_dict["metadata"], # RAGChunker metadata (chunk_index, etc.)
"is_chunked": True,
"chunk_id": chunk_dict["chunk_id"],
}
result.append((chunk_text, chunk_meta))
return result
def _format_output_path(
self, skill_dir: Path, output_path: Path, suffix: str
) -> Path:
def _format_output_path(self, skill_dir: Path, output_path: Path, suffix: str) -> Path:
"""
Generate standardized output path with intelligent format handling.
@@ -379,11 +377,13 @@ class SkillAdaptor(ABC):
output_str = str(output_path)
# Extract the file extension from suffix (e.g., ".json" from "-langchain.json")
correct_ext = suffix.split('.')[-1] if '.' in suffix else ''
correct_ext = suffix.split(".")[-1] if "." in suffix else ""
if correct_ext and not output_str.endswith(f".{correct_ext}"):
# Replace common incorrect extensions
output_str = output_str.replace(".zip", f".{correct_ext}").replace(".tar.gz", f".{correct_ext}")
output_str = output_str.replace(".zip", f".{correct_ext}").replace(
".tar.gz", f".{correct_ext}"
)
# Ensure platform suffix is present
if not output_str.endswith(suffix):
@@ -395,9 +395,7 @@ class SkillAdaptor(ABC):
return Path(output_str)
def _generate_deterministic_id(
self, content: str, metadata: dict, format: str = "hex"
) -> str:
def _generate_deterministic_id(self, content: str, metadata: dict, format: str = "hex") -> str:
"""
Generate deterministic ID from content and metadata.

View File

@@ -43,11 +43,7 @@ class ChromaAdaptor(SkillAdaptor):
return self._generate_deterministic_id(content, metadata, format="hex")
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as JSON for Chroma ingestion.
@@ -90,9 +86,9 @@ class ChromaAdaptor(SkillAdaptor):
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks to parallel arrays
@@ -120,9 +116,9 @@ class ChromaAdaptor(SkillAdaptor):
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks to parallel arrays
@@ -149,7 +145,7 @@ class ChromaAdaptor(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for Chroma.
@@ -183,7 +179,7 @@ class ChromaAdaptor(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file
@@ -233,7 +229,7 @@ class ChromaAdaptor(SkillAdaptor):
except ImportError:
return {
"success": False,
"message": "chromadb not installed. Run: pip install chromadb"
"message": "chromadb not installed. Run: pip install chromadb",
}
# Load package
@@ -241,8 +237,8 @@ class ChromaAdaptor(SkillAdaptor):
data = json.load(f)
# Determine client type and configuration
persist_directory = kwargs.get('persist_directory')
chroma_url = kwargs.get('chroma_url')
persist_directory = kwargs.get("persist_directory")
chroma_url = kwargs.get("chroma_url")
try:
if persist_directory:
@@ -253,15 +249,15 @@ class ChromaAdaptor(SkillAdaptor):
# Remote HTTP client
print(f"🌐 Connecting to ChromaDB at: {chroma_url}")
# Parse URL
if '://' in chroma_url:
parts = chroma_url.split('://')
if "://" in chroma_url:
parts = chroma_url.split("://")
parts[0]
host_port = parts[1]
else:
host_port = chroma_url
if ':' in host_port:
host, port = host_port.rsplit(':', 1)
if ":" in host_port:
host, port = host_port.rsplit(":", 1)
port = int(port)
else:
host = host_port
@@ -276,12 +272,12 @@ class ChromaAdaptor(SkillAdaptor):
except Exception as e:
return {
"success": False,
"message": f"Failed to connect to ChromaDB: {e}\n\nTry:\n pip install chromadb\n chroma run # Start local server"
"message": f"Failed to connect to ChromaDB: {e}\n\nTry:\n pip install chromadb\n chroma run # Start local server",
}
# Get or create collection
collection_name = kwargs.get('collection_name', data.get('collection_name', 'skill_docs'))
distance_function = kwargs.get('distance_function', 'cosine')
collection_name = kwargs.get("collection_name", data.get("collection_name", "skill_docs"))
distance_function = kwargs.get("distance_function", "cosine")
try:
# Try to get existing collection
@@ -291,62 +287,57 @@ class ChromaAdaptor(SkillAdaptor):
try:
# Create new collection
metadata = {"hnsw:space": distance_function}
collection = client.create_collection(
name=collection_name,
metadata=metadata
)
collection = client.create_collection(name=collection_name, metadata=metadata)
print(f"✅ Created collection: {collection_name} (distance: {distance_function})")
except Exception as e:
return {
"success": False,
"message": f"Failed to create collection '{collection_name}': {e}"
"message": f"Failed to create collection '{collection_name}': {e}",
}
# Handle embeddings
embedding_function = kwargs.get('embedding_function')
embedding_function = kwargs.get("embedding_function")
try:
if embedding_function == 'openai':
if embedding_function == "openai":
# Generate embeddings with OpenAI
print("🔄 Generating OpenAI embeddings...")
embeddings = self._generate_openai_embeddings(
data['documents'],
api_key=kwargs.get('openai_api_key')
data["documents"], api_key=kwargs.get("openai_api_key")
)
collection.add(
documents=data['documents'],
metadatas=data['metadatas'],
ids=data['ids'],
embeddings=embeddings
documents=data["documents"],
metadatas=data["metadatas"],
ids=data["ids"],
embeddings=embeddings,
)
elif embedding_function == 'sentence-transformers':
elif embedding_function == "sentence-transformers":
# Use sentence-transformers
print("🔄 Generating sentence-transformer embeddings...")
try:
from chromadb.utils import embedding_functions
ef = embedding_functions.SentenceTransformerEmbeddingFunction()
embeddings = [ef([doc])[0] for doc in data['documents']]
embeddings = [ef([doc])[0] for doc in data["documents"]]
collection.add(
documents=data['documents'],
metadatas=data['metadatas'],
ids=data['ids'],
embeddings=embeddings
documents=data["documents"],
metadatas=data["metadatas"],
ids=data["ids"],
embeddings=embeddings,
)
except ImportError:
return {
"success": False,
"message": "sentence-transformers not installed. Run: pip install sentence-transformers"
"message": "sentence-transformers not installed. Run: pip install sentence-transformers",
}
else:
# No embeddings - Chroma will auto-generate
print("🔄 Using Chroma's default embedding function...")
collection.add(
documents=data['documents'],
metadatas=data['metadatas'],
ids=data['ids']
documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"]
)
count = len(data['documents'])
count = len(data["documents"])
print(f"✅ Uploaded {count} documents to ChromaDB")
print(f"📊 Collection '{collection_name}' now has {collection.count()} total documents")
@@ -355,19 +346,14 @@ class ChromaAdaptor(SkillAdaptor):
"message": f"Uploaded {count} documents to ChromaDB collection '{collection_name}'",
"collection": collection_name,
"count": count,
"url": f"{chroma_url}/collections/{collection_name}" if chroma_url else None
"url": f"{chroma_url}/collections/{collection_name}" if chroma_url else None,
}
except Exception as e:
return {
"success": False,
"message": f"Upload failed: {e}"
}
return {"success": False, "message": f"Upload failed: {e}"}
def _generate_openai_embeddings(
self,
documents: list[str],
api_key: str = None
self, documents: list[str], api_key: str = None
) -> list[list[float]]:
"""
Generate embeddings using OpenAI API.
@@ -380,12 +366,13 @@ class ChromaAdaptor(SkillAdaptor):
List of embedding vectors
"""
import os
try:
from openai import OpenAI
except ImportError:
raise ImportError("openai not installed. Run: pip install openai") from None
api_key = api_key or os.getenv('OPENAI_API_KEY')
api_key = api_key or os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
@@ -398,14 +385,14 @@ class ChromaAdaptor(SkillAdaptor):
print(f" Generating embeddings for {len(documents)} documents...")
for i in range(0, len(documents), batch_size):
batch = documents[i:i+batch_size]
batch = documents[i : i + batch_size]
try:
response = client.embeddings.create(
input=batch,
model="text-embedding-3-small" # Cheapest, fastest
model="text-embedding-3-small", # Cheapest, fastest
)
embeddings.extend([item.embedding for item in response.data])
print(f" ✓ Processed {min(i+batch_size, len(documents))}/{len(documents)}")
print(f" ✓ Processed {min(i + batch_size, len(documents))}/{len(documents)}")
except Exception as e:
raise Exception(f"OpenAI embedding generation failed: {e}") from e

View File

@@ -81,7 +81,14 @@ version: {metadata.version}
{content_body}
"""
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
def package(
self,
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into ZIP file for Claude.

View File

@@ -46,11 +46,7 @@ class FAISSHelpers(SkillAdaptor):
return self._generate_deterministic_id(content, metadata, format="hex")
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as JSON for FAISS ingestion.
@@ -92,9 +88,9 @@ class FAISSHelpers(SkillAdaptor):
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks to parallel arrays
@@ -121,9 +117,9 @@ class FAISSHelpers(SkillAdaptor):
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks to parallel arrays
@@ -160,7 +156,7 @@ class FAISSHelpers(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for FAISS.
@@ -193,7 +189,7 @@ class FAISSHelpers(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file

View File

@@ -86,7 +86,14 @@ See the references directory for complete documentation with examples and best p
# Return plain markdown (NO frontmatter)
return content_body
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
def package(
self,
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into tar.gz file for Gemini.

View File

@@ -29,11 +29,7 @@ class HaystackAdaptor(SkillAdaptor):
DEFAULT_API_ENDPOINT = None # No upload endpoint
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as JSON array of Haystack Documents.
@@ -73,17 +69,19 @@ class HaystackAdaptor(SkillAdaptor):
content,
doc_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks as documents
for chunk_text, chunk_meta in chunks:
documents.append({
"content": chunk_text,
"meta": chunk_meta,
})
documents.append(
{
"content": chunk_text,
"meta": chunk_meta,
}
)
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -104,17 +102,19 @@ class HaystackAdaptor(SkillAdaptor):
ref_content,
doc_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks as documents
for chunk_text, chunk_meta in chunks:
documents.append({
"content": chunk_text,
"meta": chunk_meta,
})
documents.append(
{
"content": chunk_text,
"meta": chunk_meta,
}
)
# Return as formatted JSON
return json.dumps(documents, indent=2, ensure_ascii=False)
@@ -125,7 +125,7 @@ class HaystackAdaptor(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for Haystack.
@@ -159,7 +159,7 @@ class HaystackAdaptor(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file

View File

@@ -29,11 +29,7 @@ class LangChainAdaptor(SkillAdaptor):
DEFAULT_API_ENDPOINT = None # No upload endpoint
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as JSON array of LangChain Documents.
@@ -73,17 +69,14 @@ class LangChainAdaptor(SkillAdaptor):
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks to documents
for chunk_text, chunk_meta in chunks:
documents.append({
"page_content": chunk_text,
"metadata": chunk_meta
})
documents.append({"page_content": chunk_text, "metadata": chunk_meta})
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -104,17 +97,14 @@ class LangChainAdaptor(SkillAdaptor):
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks to documents
for chunk_text, chunk_meta in chunks:
documents.append({
"page_content": chunk_text,
"metadata": chunk_meta
})
documents.append({"page_content": chunk_text, "metadata": chunk_meta})
# Return as formatted JSON
return json.dumps(documents, indent=2, ensure_ascii=False)
@@ -125,7 +115,7 @@ class LangChainAdaptor(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for LangChain.
@@ -162,7 +152,7 @@ class LangChainAdaptor(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file

View File

@@ -42,11 +42,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
return self._generate_deterministic_id(content, metadata, format="hex")
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as JSON array of LlamaIndex Nodes.
@@ -88,19 +84,21 @@ class LlamaIndexAdaptor(SkillAdaptor):
content,
node_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks as nodes
for chunk_text, chunk_meta in chunks:
nodes.append({
"text": chunk_text,
"metadata": chunk_meta,
"id_": self._generate_node_id(chunk_text, chunk_meta),
"embedding": None,
})
nodes.append(
{
"text": chunk_text,
"metadata": chunk_meta,
"id_": self._generate_node_id(chunk_text, chunk_meta),
"embedding": None,
}
)
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -121,19 +119,21 @@ class LlamaIndexAdaptor(SkillAdaptor):
ref_content,
node_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks as nodes
for chunk_text, chunk_meta in chunks:
nodes.append({
"text": chunk_text,
"metadata": chunk_meta,
"id_": self._generate_node_id(chunk_text, chunk_meta),
"embedding": None,
})
nodes.append(
{
"text": chunk_text,
"metadata": chunk_meta,
"id_": self._generate_node_id(chunk_text, chunk_meta),
"embedding": None,
}
)
# Return as formatted JSON
return json.dumps(nodes, indent=2, ensure_ascii=False)
@@ -144,7 +144,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for LlamaIndex.
@@ -178,7 +178,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file

View File

@@ -81,7 +81,14 @@ Browse the reference files for detailed information on each topic. All files are
# Return pure markdown (no frontmatter, no special formatting)
return content_body
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
def package(
self,
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into ZIP file with markdown documentation.

View File

@@ -103,7 +103,14 @@ Always prioritize accuracy by consulting the attached documentation files before
# Return plain text instructions (NO frontmatter)
return content_body
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
def package(
self,
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into ZIP file for OpenAI Assistants.

View File

@@ -44,11 +44,7 @@ class QdrantAdaptor(SkillAdaptor):
return self._generate_deterministic_id(content, metadata, format="uuid5")
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as Qdrant collection JSON.
@@ -87,30 +83,35 @@ class QdrantAdaptor(SkillAdaptor):
content,
payload_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks as points
for chunk_text, chunk_meta in chunks:
point_id = self._generate_point_id(chunk_text, {
"source": chunk_meta.get("source", metadata.name),
"file": chunk_meta.get("file", "SKILL.md")
})
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": chunk_text,
point_id = self._generate_point_id(
chunk_text,
{
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", "overview"),
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
},
)
points.append(
{
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", "overview"),
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
},
}
})
)
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -130,30 +131,35 @@ class QdrantAdaptor(SkillAdaptor):
ref_content,
payload_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks as points
for chunk_text, chunk_meta in chunks:
point_id = self._generate_point_id(chunk_text, {
"source": chunk_meta.get("source", metadata.name),
"file": chunk_meta.get("file", ref_file.name)
})
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": chunk_text,
point_id = self._generate_point_id(
chunk_text,
{
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", category),
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
},
)
points.append(
{
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", category),
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
},
}
})
)
# Qdrant configuration
config = {
@@ -184,7 +190,7 @@ class QdrantAdaptor(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for Qdrant.
@@ -217,7 +223,7 @@ class QdrantAdaptor(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file

View File

@@ -36,7 +36,7 @@ class StreamingAdaptorMixin:
chunk_size: int = 4000,
chunk_overlap: int = 200,
batch_size: int = 100,
progress_callback: callable | None = None
progress_callback: callable | None = None,
) -> Path:
"""
Package skill using streaming ingestion.
@@ -60,9 +60,7 @@ class StreamingAdaptorMixin:
# Initialize streaming ingester
ingester = StreamingIngester(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
batch_size=batch_size
chunk_size=chunk_size, chunk_overlap=chunk_overlap, batch_size=batch_size
)
print(f"\n📊 Streaming ingestion starting...")
@@ -77,9 +75,11 @@ class StreamingAdaptorMixin:
nonlocal last_update
# Update every 10 chunks
if progress.processed_chunks - last_update >= 10:
print(f" {progress.progress_percent:.1f}% - "
f"{progress.processed_chunks}/{progress.total_chunks} chunks "
f"({progress.chunks_per_second:.1f} chunks/sec)")
print(
f" {progress.progress_percent:.1f}% - "
f"{progress.processed_chunks}/{progress.total_chunks} chunks "
f"({progress.chunks_per_second:.1f} chunks/sec)"
)
last_update = progress.processed_chunks
if progress_callback:
@@ -97,10 +97,7 @@ class StreamingAdaptorMixin:
# Convert chunks to platform format
print(f"\n📦 Converting to {self.PLATFORM_NAME} format...")
package_data = self._convert_chunks_to_platform_format(
all_chunks,
skill_dir.name
)
package_data = self._convert_chunks_to_platform_format(all_chunks, skill_dir.name)
# Determine output filename
if output_path.is_dir() or str(output_path).endswith("/"):
@@ -114,8 +111,7 @@ class StreamingAdaptorMixin:
# Write output
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(
json.dumps(package_data, indent=2, ensure_ascii=False),
encoding="utf-8"
json.dumps(package_data, indent=2, ensure_ascii=False), encoding="utf-8"
)
print(f"✅ Package created: {output_path}")
@@ -124,9 +120,7 @@ class StreamingAdaptorMixin:
return output_path
def _convert_chunks_to_platform_format(
self,
chunks: list[tuple[str, dict]],
skill_name: str
self, chunks: list[tuple[str, dict]], skill_name: str
) -> dict:
"""
Convert chunks to platform-specific format.
@@ -156,14 +150,11 @@ class StreamingAdaptorMixin:
"metadatas": metadatas,
"ids": ids,
"total_chunks": len(chunks),
"streaming": True
"streaming": True,
}
def estimate_chunks(
self,
skill_dir: Path,
chunk_size: int = 4000,
chunk_overlap: int = 200
self, skill_dir: Path, chunk_size: int = 4000, chunk_overlap: int = 200
) -> dict[str, Any]:
"""
Estimate chunking for a skill directory.
@@ -179,10 +170,7 @@ class StreamingAdaptorMixin:
Estimation statistics
"""
skill_dir = Path(skill_dir)
StreamingIngester(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap
)
StreamingIngester(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
# Count files and estimate chunks
total_docs = 0
@@ -201,11 +189,9 @@ class StreamingAdaptorMixin:
total_chars += char_count
estimated_chunks += chunk_count
file_stats.append({
"file": "SKILL.md",
"chars": char_count,
"estimated_chunks": chunk_count
})
file_stats.append(
{"file": "SKILL.md", "chars": char_count, "estimated_chunks": chunk_count}
)
# Reference files
refs_dir = skill_dir / "references"
@@ -214,17 +200,21 @@ class StreamingAdaptorMixin:
if ref_file.is_file() and not ref_file.name.startswith("."):
content = ref_file.read_text(encoding="utf-8")
char_count = len(content)
chunk_count = max(1, (char_count - chunk_overlap) // (chunk_size - chunk_overlap) + 1)
chunk_count = max(
1, (char_count - chunk_overlap) // (chunk_size - chunk_overlap) + 1
)
total_docs += 1
total_chars += char_count
estimated_chunks += chunk_count
file_stats.append({
"file": ref_file.name,
"chars": char_count,
"estimated_chunks": chunk_count
})
file_stats.append(
{
"file": ref_file.name,
"chars": char_count,
"estimated_chunks": chunk_count,
}
)
return {
"skill_name": skill_dir.name,
@@ -235,7 +225,7 @@ class StreamingAdaptorMixin:
"chunk_overlap": chunk_overlap,
"file_stats": file_stats,
"estimated_memory_mb": (total_chars * 2) / (1024 * 1024), # UTF-8 estimate
"recommended_streaming": total_chars > 1_000_000 or total_docs > 100
"recommended_streaming": total_chars > 1_000_000 or total_docs > 100,
}
@@ -251,25 +241,27 @@ class StreamingLangChainAdaptor(StreamingAdaptorMixin):
documents = []
for chunk_text, chunk_meta in chunks:
documents.append({
"page_content": chunk_text,
"metadata": {
"source": chunk_meta["source"],
"category": chunk_meta["category"],
"file": chunk_meta["file"],
"chunk_id": chunk_meta["chunk_id"],
"chunk_index": chunk_meta["chunk_index"],
"total_chunks": chunk_meta["total_chunks"],
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", "1.0.0"),
documents.append(
{
"page_content": chunk_text,
"metadata": {
"source": chunk_meta["source"],
"category": chunk_meta["category"],
"file": chunk_meta["file"],
"chunk_id": chunk_meta["chunk_id"],
"chunk_index": chunk_meta["chunk_index"],
"total_chunks": chunk_meta["total_chunks"],
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", "1.0.0"),
},
}
})
)
return {
"documents": documents,
"total_chunks": len(chunks),
"streaming": True,
"format": "LangChain Document"
"format": "LangChain Document",
}
@@ -287,14 +279,16 @@ class StreamingChromaAdaptor(StreamingAdaptorMixin):
for chunk_text, chunk_meta in chunks:
documents.append(chunk_text)
metadatas.append({
"source": chunk_meta["source"],
"category": chunk_meta["category"],
"file": chunk_meta["file"],
"chunk_index": chunk_meta["chunk_index"],
"total_chunks": chunk_meta["total_chunks"],
"type": chunk_meta.get("type", "documentation"),
})
metadatas.append(
{
"source": chunk_meta["source"],
"category": chunk_meta["category"],
"file": chunk_meta["file"],
"chunk_index": chunk_meta["chunk_index"],
"total_chunks": chunk_meta["total_chunks"],
"type": chunk_meta.get("type", "documentation"),
}
)
ids.append(chunk_meta["chunk_id"])
return {
@@ -303,7 +297,7 @@ class StreamingChromaAdaptor(StreamingAdaptorMixin):
"ids": ids,
"collection_name": skill_name.replace("_", "-"),
"total_chunks": len(chunks),
"streaming": True
"streaming": True,
}
@@ -339,11 +333,7 @@ def demo_streaming():
print("=" * 60)
output = adaptor.package_streaming(
skill_dir,
Path("output"),
chunk_size=2000,
chunk_overlap=100,
batch_size=50
skill_dir, Path("output"), chunk_size=2000, chunk_overlap=100, batch_size=50
)
print(f"\n✅ Complete! Output: {output}")

View File

@@ -104,11 +104,7 @@ class WeaviateAdaptor(SkillAdaptor):
}
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as JSON for Weaviate ingestion.
@@ -148,24 +144,26 @@ class WeaviateAdaptor(SkillAdaptor):
content,
obj_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks as objects
for chunk_text, chunk_meta in chunks:
objects.append({
"id": self._generate_uuid(chunk_text, chunk_meta),
"properties": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", "overview"),
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
},
})
objects.append(
{
"id": self._generate_uuid(chunk_text, chunk_meta),
"properties": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", "overview"),
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
},
}
)
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -186,24 +184,26 @@ class WeaviateAdaptor(SkillAdaptor):
ref_content,
obj_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks as objects
for chunk_text, chunk_meta in chunks:
objects.append({
"id": self._generate_uuid(chunk_text, chunk_meta),
"properties": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", category),
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
},
})
objects.append(
{
"id": self._generate_uuid(chunk_text, chunk_meta),
"properties": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", category),
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
},
}
)
# Generate schema
class_name = "".join(word.capitalize() for word in metadata.name.split("_"))
@@ -222,7 +222,7 @@ class WeaviateAdaptor(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for Weaviate.
@@ -258,7 +258,7 @@ class WeaviateAdaptor(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file
@@ -310,7 +310,7 @@ class WeaviateAdaptor(SkillAdaptor):
except ImportError:
return {
"success": False,
"message": "weaviate-client not installed. Run: pip install weaviate-client"
"message": "weaviate-client not installed. Run: pip install weaviate-client",
}
# Load package
@@ -319,16 +319,16 @@ class WeaviateAdaptor(SkillAdaptor):
# Connect to Weaviate
try:
if kwargs.get('use_cloud') and api_key:
if kwargs.get("use_cloud") and api_key:
# Weaviate Cloud
print(f"🌐 Connecting to Weaviate Cloud: {kwargs.get('cluster_url')}")
client = weaviate.Client(
url=kwargs.get('cluster_url'),
auth_client_secret=weaviate.AuthApiKey(api_key=api_key)
url=kwargs.get("cluster_url"),
auth_client_secret=weaviate.AuthApiKey(api_key=api_key),
)
else:
# Local Weaviate instance
weaviate_url = kwargs.get('weaviate_url', 'http://localhost:8080')
weaviate_url = kwargs.get("weaviate_url", "http://localhost:8080")
print(f"🌐 Connecting to Weaviate at: {weaviate_url}")
client = weaviate.Client(url=weaviate_url)
@@ -336,69 +336,67 @@ class WeaviateAdaptor(SkillAdaptor):
if not client.is_ready():
return {
"success": False,
"message": "Weaviate server not ready. Make sure Weaviate is running:\n docker run -p 8080:8080 semitechnologies/weaviate:latest"
"message": "Weaviate server not ready. Make sure Weaviate is running:\n docker run -p 8080:8080 semitechnologies/weaviate:latest",
}
except Exception as e:
return {
"success": False,
"message": f"Failed to connect to Weaviate: {e}\n\nMake sure Weaviate is running or provide correct credentials."
"message": f"Failed to connect to Weaviate: {e}\n\nMake sure Weaviate is running or provide correct credentials.",
}
# Create schema
try:
client.schema.create_class(data['schema'])
client.schema.create_class(data["schema"])
print(f"✅ Created schema: {data['class_name']}")
except Exception as e:
if "already exists" in str(e).lower():
print(f" Schema already exists: {data['class_name']}")
else:
return {
"success": False,
"message": f"Schema creation failed: {e}"
}
return {"success": False, "message": f"Schema creation failed: {e}"}
# Handle embeddings
embedding_function = kwargs.get('embedding_function')
embedding_function = kwargs.get("embedding_function")
try:
with client.batch as batch:
batch.batch_size = 100
if embedding_function == 'openai':
if embedding_function == "openai":
# Generate embeddings with OpenAI
print("🔄 Generating OpenAI embeddings and uploading...")
embeddings = self._generate_openai_embeddings(
[obj['properties']['content'] for obj in data['objects']],
api_key=kwargs.get('openai_api_key')
[obj["properties"]["content"] for obj in data["objects"]],
api_key=kwargs.get("openai_api_key"),
)
for i, obj in enumerate(data['objects']):
for i, obj in enumerate(data["objects"]):
batch.add_data_object(
data_object=obj['properties'],
class_name=data['class_name'],
uuid=obj['id'],
vector=embeddings[i]
data_object=obj["properties"],
class_name=data["class_name"],
uuid=obj["id"],
vector=embeddings[i],
)
if (i + 1) % 100 == 0:
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
elif embedding_function == 'sentence-transformers':
elif embedding_function == "sentence-transformers":
# Use sentence-transformers
print("🔄 Generating sentence-transformer embeddings and uploading...")
try:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
contents = [obj['properties']['content'] for obj in data['objects']]
model = SentenceTransformer("all-MiniLM-L6-v2")
contents = [obj["properties"]["content"] for obj in data["objects"]]
embeddings = model.encode(contents, show_progress_bar=True).tolist()
for i, obj in enumerate(data['objects']):
for i, obj in enumerate(data["objects"]):
batch.add_data_object(
data_object=obj['properties'],
class_name=data['class_name'],
uuid=obj['id'],
vector=embeddings[i]
data_object=obj["properties"],
class_name=data["class_name"],
uuid=obj["id"],
vector=embeddings[i],
)
if (i + 1) % 100 == 0:
@@ -407,42 +405,37 @@ class WeaviateAdaptor(SkillAdaptor):
except ImportError:
return {
"success": False,
"message": "sentence-transformers not installed. Run: pip install sentence-transformers"
"message": "sentence-transformers not installed. Run: pip install sentence-transformers",
}
else:
# No embeddings - Weaviate will use its configured vectorizer
print("🔄 Uploading objects (Weaviate will generate embeddings)...")
for i, obj in enumerate(data['objects']):
for i, obj in enumerate(data["objects"]):
batch.add_data_object(
data_object=obj['properties'],
class_name=data['class_name'],
uuid=obj['id']
data_object=obj["properties"],
class_name=data["class_name"],
uuid=obj["id"],
)
if (i + 1) % 100 == 0:
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
count = len(data['objects'])
count = len(data["objects"])
print(f"✅ Upload complete! {count} objects added to Weaviate")
return {
"success": True,
"message": f"Uploaded {count} objects to Weaviate class '{data['class_name']}'",
"class_name": data['class_name'],
"count": count
"class_name": data["class_name"],
"count": count,
}
except Exception as e:
return {
"success": False,
"message": f"Upload failed: {e}"
}
return {"success": False, "message": f"Upload failed: {e}"}
def _generate_openai_embeddings(
self,
documents: list[str],
api_key: str = None
self, documents: list[str], api_key: str = None
) -> list[list[float]]:
"""
Generate embeddings using OpenAI API.
@@ -455,12 +448,13 @@ class WeaviateAdaptor(SkillAdaptor):
List of embedding vectors
"""
import os
try:
from openai import OpenAI
except ImportError:
raise ImportError("openai not installed. Run: pip install openai") from None
api_key = api_key or os.getenv('OPENAI_API_KEY')
api_key = api_key or os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
@@ -473,14 +467,16 @@ class WeaviateAdaptor(SkillAdaptor):
print(f" Generating embeddings for {len(documents)} documents...")
for i in range(0, len(documents), batch_size):
batch = documents[i:i+batch_size]
batch = documents[i : i + batch_size]
try:
response = client.embeddings.create(
input=batch,
model="text-embedding-3-small" # Cheapest, fastest
model="text-embedding-3-small", # Cheapest, fastest
)
embeddings.extend([item.embedding for item in response.data])
print(f" ✓ Generated {min(i+batch_size, len(documents))}/{len(documents)} embeddings")
print(
f" ✓ Generated {min(i + batch_size, len(documents))}/{len(documents)} embeddings"
)
except Exception as e:
raise Exception(f"OpenAI embedding generation failed: {e}") from e

View File

@@ -101,10 +101,38 @@ class ArchitecturalPatternDetector:
# Web Frameworks
"Django": ["django", "manage.py", "settings.py", "urls.py"],
"Flask": ["flask", "app.py", "wsgi.py"],
"Spring": ["springframework", "org.springframework", "@Controller", "@Service", "@Repository"],
"ASP.NET": ["Microsoft.AspNetCore", "System.Web", "Controllers", "Models", "Views", ".cshtml", "Startup.cs"],
"Rails": ["rails", "action", "app/models", "app/views", "app/controllers", "config/routes.rb"],
"Angular": ["@angular", "angular", "app.module.ts", "@Component", "@Injectable", "angular.json"],
"Spring": [
"springframework",
"org.springframework",
"@Controller",
"@Service",
"@Repository",
],
"ASP.NET": [
"Microsoft.AspNetCore",
"System.Web",
"Controllers",
"Models",
"Views",
".cshtml",
"Startup.cs",
],
"Rails": [
"rails",
"action",
"app/models",
"app/views",
"app/controllers",
"config/routes.rb",
],
"Angular": [
"@angular",
"angular",
"app.module.ts",
"@Component",
"@Injectable",
"angular.json",
],
"React": ["react", "package.json", "components"],
"Vue.js": ["vue", ".vue", "components"],
"Express": ["express", "app.js", "routes"],
@@ -208,7 +236,9 @@ class ArchitecturalPatternDetector:
# Create searchable import string
import_content = " ".join(all_imports)
logger.debug(f"Collected {len(all_imports)} imports from {len([f for f in files if f.get('imports')])} files for framework detection")
logger.debug(
f"Collected {len(all_imports)} imports from {len([f for f in files if f.get('imports')])} files for framework detection"
)
# Also check actual directory structure for game engine markers
# (project.godot, .unity, .uproject are config files, not in analyzed files)
@@ -245,7 +275,9 @@ class ArchitecturalPatternDetector:
# Check in file paths, directory structure, AND imports
path_matches = sum(1 for marker in markers if marker.lower() in all_content.lower())
dir_matches = sum(1 for marker in markers if marker.lower() in dir_content.lower())
import_matches = sum(1 for marker in markers if marker.lower() in import_content.lower())
import_matches = sum(
1 for marker in markers if marker.lower() in import_content.lower()
)
# Strategy: Prioritize import-based detection (more accurate)
# If we have import matches, they're strong signals - use them alone
@@ -257,7 +289,9 @@ class ArchitecturalPatternDetector:
elif (path_matches + dir_matches) >= 2:
# Path/directory-based detection (requires 2+ matches)
detected.append(framework)
logger.info(f" 📦 Detected framework: {framework} (path:{path_matches} dir:{dir_matches})")
logger.info(
f" 📦 Detected framework: {framework} (path:{path_matches} dir:{dir_matches})"
)
return detected

View File

@@ -77,7 +77,9 @@ def run_embedding_benchmark(runner, config):
with bench.timer("batch_embedding"), bench.memory("batch_embedding"):
embeddings = generator.generate_batch(texts, model=model)
bench.metric("embeddings_per_sec", len(embeddings) / bench.result.timings[-1].duration, "emb/sec")
bench.metric(
"embeddings_per_sec", len(embeddings) / bench.result.timings[-1].duration, "emb/sec"
)
name = config.get("name", "embedding-benchmark")
report = runner.run(name, benchmark_func)
@@ -97,7 +99,7 @@ def run_storage_benchmark(runner, config):
storage = get_storage_adaptor(provider, bucket=bucket)
# Create test file
with NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
with NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
f.write("Test data" * 1000)
test_file = Path(f.name)
@@ -128,10 +130,7 @@ def compare_command(args):
"""Compare two benchmarks."""
runner = BenchmarkRunner()
comparison = runner.compare(
baseline_path=Path(args.baseline),
current_path=Path(args.current)
)
comparison = runner.compare(baseline_path=Path(args.baseline), current_path=Path(args.current))
print(f"\n📊 Comparison: {comparison.name}\n")
print(f"Overall: {comparison.overall_improvement}\n")
@@ -213,7 +212,7 @@ def cleanup_command(args):
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description='Performance benchmarking suite',
description="Performance benchmarking suite",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -233,54 +232,46 @@ Examples:
# Cleanup old benchmarks
skill-seekers-benchmark cleanup --keep 5
"""
""",
)
subparsers = parser.add_subparsers(dest='command', help='Command to execute')
subparsers = parser.add_subparsers(dest="command", help="Command to execute")
# Run command
run_parser = subparsers.add_parser('run', help='Run benchmark')
run_parser.add_argument('--config', required=True, help='Benchmark config file')
run_parser = subparsers.add_parser("run", help="Run benchmark")
run_parser.add_argument("--config", required=True, help="Benchmark config file")
run_parser.add_argument(
'--output-dir', '-o',
default='benchmarks',
help='Output directory (default: benchmarks)'
"--output-dir", "-o", default="benchmarks", help="Output directory (default: benchmarks)"
)
# Compare command
compare_parser = subparsers.add_parser('compare', help='Compare two benchmarks')
compare_parser.add_argument('--baseline', required=True, help='Baseline benchmark')
compare_parser.add_argument('--current', required=True, help='Current benchmark')
compare_parser = subparsers.add_parser("compare", help="Compare two benchmarks")
compare_parser.add_argument("--baseline", required=True, help="Baseline benchmark")
compare_parser.add_argument("--current", required=True, help="Current benchmark")
compare_parser.add_argument(
'--fail-on-regression',
action='store_true',
help='Exit with error if regressions detected'
"--fail-on-regression", action="store_true", help="Exit with error if regressions detected"
)
# List command
list_parser = subparsers.add_parser('list', help='List saved benchmarks')
list_parser = subparsers.add_parser("list", help="List saved benchmarks")
list_parser.add_argument(
'--output-dir', '-o',
default='benchmarks',
help='Benchmark directory (default: benchmarks)'
"--output-dir", "-o", default="benchmarks", help="Benchmark directory (default: benchmarks)"
)
# Show command
show_parser = subparsers.add_parser('show', help='Show benchmark details')
show_parser.add_argument('path', help='Path to benchmark file')
show_parser = subparsers.add_parser("show", help="Show benchmark details")
show_parser.add_argument("path", help="Path to benchmark file")
# Cleanup command
cleanup_parser = subparsers.add_parser('cleanup', help='Cleanup old benchmarks')
cleanup_parser = subparsers.add_parser("cleanup", help="Cleanup old benchmarks")
cleanup_parser.add_argument(
'--output-dir', '-o',
default='benchmarks',
help='Benchmark directory (default: benchmarks)'
"--output-dir", "-o", default="benchmarks", help="Benchmark directory (default: benchmarks)"
)
cleanup_parser.add_argument(
'--keep',
"--keep",
type=int,
default=5,
help='Number of latest benchmarks to keep per name (default: 5)'
help="Number of latest benchmarks to keep per name (default: 5)",
)
args = parser.parse_args()
@@ -290,20 +281,20 @@ Examples:
sys.exit(1)
try:
if args.command == 'run':
if args.command == "run":
run_command(args)
elif args.command == 'compare':
elif args.command == "compare":
compare_command(args)
elif args.command == 'list':
elif args.command == "list":
list_command(args)
elif args.command == 'show':
elif args.command == "show":
show_command(args)
elif args.command == 'cleanup':
elif args.command == "cleanup":
cleanup_command(args)
except Exception as e:
print(f"\n❌ Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -15,18 +15,13 @@ from .storage import get_storage_adaptor
def upload_command(args):
"""Handle upload subcommand."""
adaptor = get_storage_adaptor(
args.provider,
bucket=args.bucket,
container=args.container,
**parse_extra_args(args.extra)
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
)
if Path(args.local_path).is_dir():
print(f"📁 Uploading directory: {args.local_path}")
uploaded_files = adaptor.upload_directory(
args.local_path,
args.remote_path,
exclude_patterns=args.exclude
args.local_path, args.remote_path, exclude_patterns=args.exclude
)
print(f"✅ Uploaded {len(uploaded_files)} files")
if args.verbose:
@@ -41,19 +36,13 @@ def upload_command(args):
def download_command(args):
"""Handle download subcommand."""
adaptor = get_storage_adaptor(
args.provider,
bucket=args.bucket,
container=args.container,
**parse_extra_args(args.extra)
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
)
# Check if remote path is a directory (ends with /)
if args.remote_path.endswith('/'):
if args.remote_path.endswith("/"):
print(f"📁 Downloading directory: {args.remote_path}")
downloaded_files = adaptor.download_directory(
args.remote_path,
args.local_path
)
downloaded_files = adaptor.download_directory(args.remote_path, args.local_path)
print(f"✅ Downloaded {len(downloaded_files)} files")
if args.verbose:
for file_path in downloaded_files:
@@ -67,10 +56,7 @@ def download_command(args):
def list_command(args):
"""Handle list subcommand."""
adaptor = get_storage_adaptor(
args.provider,
bucket=args.bucket,
container=args.container,
**parse_extra_args(args.extra)
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
)
print(f"📋 Listing files: {args.prefix or '(root)'}")
@@ -99,15 +85,12 @@ def list_command(args):
def delete_command(args):
"""Handle delete subcommand."""
adaptor = get_storage_adaptor(
args.provider,
bucket=args.bucket,
container=args.container,
**parse_extra_args(args.extra)
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
)
if not args.force:
response = input(f"⚠️ Delete {args.remote_path}? [y/N]: ")
if response.lower() != 'y':
if response.lower() != "y":
print("❌ Deletion cancelled")
return
@@ -119,10 +102,7 @@ def delete_command(args):
def url_command(args):
"""Handle url subcommand."""
adaptor = get_storage_adaptor(
args.provider,
bucket=args.bucket,
container=args.container,
**parse_extra_args(args.extra)
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
)
print(f"🔗 Generating signed URL: {args.remote_path}")
@@ -134,10 +114,7 @@ def url_command(args):
def copy_command(args):
"""Handle copy subcommand."""
adaptor = get_storage_adaptor(
args.provider,
bucket=args.bucket,
container=args.container,
**parse_extra_args(args.extra)
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
)
print(f"📋 Copying: {args.source_path} → {args.dest_path}")
@@ -147,7 +124,7 @@ def copy_command(args):
def format_size(size_bytes: int) -> str:
"""Format file size in human-readable format."""
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
for unit in ["B", "KB", "MB", "GB", "TB"]:
if size_bytes < 1024.0:
return f"{size_bytes:.1f}{unit}"
size_bytes /= 1024.0
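
For illustration, a minimal self-contained sketch of format_size as formatted above; the PB fallback after the loop is an assumption, since the hunk ends before it:

def format_size(size_bytes: int) -> str:
    """Format file size in human-readable format."""
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if size_bytes < 1024.0:
            return f"{size_bytes:.1f}{unit}"
        size_bytes /= 1024.0
    return f"{size_bytes:.1f}PB"  # assumed fallback, not shown in this hunk

print(format_size(1536))        # 1.5KB
print(format_size(10_485_760))  # 10.0MB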
@@ -161,11 +138,11 @@ def parse_extra_args(extra: list | None) -> dict:
result = {}
for arg in extra:
if '=' in arg:
key, value = arg.split('=', 1)
result[key.lstrip('-')] = value
if "=" in arg:
key, value = arg.split("=", 1)
result[key.lstrip("-")] = value
else:
result[arg.lstrip('-')] = True
result[arg.lstrip("-")] = True
return result
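
A hedged usage sketch of parse_extra_args as shown above: each extra is split on the first "=", leading dashes are stripped, and bare flags map to True.

def parse_extra_args(extra: list | None) -> dict:
    result = {}
    for arg in extra or []:
        if "=" in arg:
            key, value = arg.split("=", 1)
            result[key.lstrip("-")] = value
        else:
            result[arg.lstrip("-")] = True
    return result

assert parse_extra_args(["--region=us-west-2", "--use-ssl"]) == {
    "region": "us-west-2",
    "use-ssl": True,
}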
@@ -173,7 +150,7 @@ def parse_extra_args(extra: list | None) -> dict:
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description='Cloud storage operations for Skill Seekers',
description="Cloud storage operations for Skill Seekers",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -197,114 +174,66 @@ Provider-specific options:
S3: --region=us-west-2 --endpoint-url=https://...
GCS: --project=my-project --credentials-path=/path/to/creds.json
Azure: --account-name=myaccount --account-key=...
"""
""",
)
# Global arguments
parser.add_argument(
'--provider',
choices=['s3', 'gcs', 'azure'],
required=True,
help='Cloud storage provider'
)
parser.add_argument(
'--bucket',
help='S3/GCS bucket name (for S3/GCS)'
)
parser.add_argument(
'--container',
help='Azure container name (for Azure)'
)
parser.add_argument(
'--verbose', '-v',
action='store_true',
help='Verbose output'
"--provider", choices=["s3", "gcs", "azure"], required=True, help="Cloud storage provider"
)
parser.add_argument("--bucket", help="S3/GCS bucket name (for S3/GCS)")
parser.add_argument("--container", help="Azure container name (for Azure)")
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
subparsers = parser.add_subparsers(dest='command', help='Command to execute')
subparsers = parser.add_subparsers(dest="command", help="Command to execute")
# Upload command
upload_parser = subparsers.add_parser('upload', help='Upload file or directory')
upload_parser.add_argument('local_path', help='Local file or directory path')
upload_parser.add_argument('remote_path', help='Remote path in cloud storage')
upload_parser = subparsers.add_parser("upload", help="Upload file or directory")
upload_parser.add_argument("local_path", help="Local file or directory path")
upload_parser.add_argument("remote_path", help="Remote path in cloud storage")
upload_parser.add_argument(
'--exclude',
action='append',
help='Glob patterns to exclude (for directories)'
)
upload_parser.add_argument(
'extra',
nargs='*',
help='Provider-specific options (--key=value)'
"--exclude", action="append", help="Glob patterns to exclude (for directories)"
)
upload_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
# Download command
download_parser = subparsers.add_parser('download', help='Download file or directory')
download_parser.add_argument('remote_path', help='Remote path in cloud storage')
download_parser.add_argument('local_path', help='Local destination path')
download_parser.add_argument(
'extra',
nargs='*',
help='Provider-specific options (--key=value)'
)
download_parser = subparsers.add_parser("download", help="Download file or directory")
download_parser.add_argument("remote_path", help="Remote path in cloud storage")
download_parser.add_argument("local_path", help="Local destination path")
download_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
# List command
list_parser = subparsers.add_parser('list', help='List files in cloud storage')
list_parser = subparsers.add_parser("list", help="List files in cloud storage")
list_parser.add_argument("--prefix", default="", help="Prefix to filter files")
list_parser.add_argument(
'--prefix',
default='',
help='Prefix to filter files'
)
list_parser.add_argument(
'--max-results',
type=int,
default=1000,
help='Maximum number of results'
)
list_parser.add_argument(
'extra',
nargs='*',
help='Provider-specific options (--key=value)'
"--max-results", type=int, default=1000, help="Maximum number of results"
)
list_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
# Delete command
delete_parser = subparsers.add_parser('delete', help='Delete file from cloud storage')
delete_parser.add_argument('remote_path', help='Remote path in cloud storage')
delete_parser = subparsers.add_parser("delete", help="Delete file from cloud storage")
delete_parser.add_argument("remote_path", help="Remote path in cloud storage")
delete_parser.add_argument(
'--force', '-f',
action='store_true',
help='Skip confirmation prompt'
)
delete_parser.add_argument(
'extra',
nargs='*',
help='Provider-specific options (--key=value)'
"--force", "-f", action="store_true", help="Skip confirmation prompt"
)
delete_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
# URL command
url_parser = subparsers.add_parser('url', help='Generate signed URL')
url_parser.add_argument('remote_path', help='Remote path in cloud storage')
url_parser = subparsers.add_parser("url", help="Generate signed URL")
url_parser.add_argument("remote_path", help="Remote path in cloud storage")
url_parser.add_argument(
'--expires-in',
"--expires-in",
type=int,
default=3600,
help='URL expiration time in seconds (default: 3600)'
)
url_parser.add_argument(
'extra',
nargs='*',
help='Provider-specific options (--key=value)'
help="URL expiration time in seconds (default: 3600)",
)
url_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
# Copy command
copy_parser = subparsers.add_parser('copy', help='Copy file within cloud storage')
copy_parser.add_argument('source_path', help='Source path')
copy_parser.add_argument('dest_path', help='Destination path')
copy_parser.add_argument(
'extra',
nargs='*',
help='Provider-specific options (--key=value)'
)
copy_parser = subparsers.add_parser("copy", help="Copy file within cloud storage")
copy_parser.add_argument("source_path", help="Source path")
copy_parser.add_argument("dest_path", help="Destination path")
copy_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
args = parser.parse_args()
@@ -313,26 +242,26 @@ Provider-specific options:
sys.exit(1)
# Validate bucket/container based on provider
if args.provider in ['s3', 'gcs'] and not args.bucket:
if args.provider in ["s3", "gcs"] and not args.bucket:
print(f"❌ Error: --bucket is required for {args.provider.upper()}", file=sys.stderr)
sys.exit(1)
elif args.provider == 'azure' and not args.container:
elif args.provider == "azure" and not args.container:
print("❌ Error: --container is required for Azure", file=sys.stderr)
sys.exit(1)
try:
# Execute command
if args.command == 'upload':
if args.command == "upload":
upload_command(args)
elif args.command == 'download':
elif args.command == "download":
download_command(args)
elif args.command == 'list':
elif args.command == "list":
list_command(args)
elif args.command == 'delete':
elif args.command == "delete":
delete_command(args)
elif args.command == 'url':
elif args.command == "url":
url_command(args)
elif args.command == 'copy':
elif args.command == "copy":
copy_command(args)
except FileNotFoundError as e:
@@ -342,9 +271,10 @@ Provider-specific options:
print(f"❌ Error: {e}", file=sys.stderr)
if args.verbose:
import traceback
traceback.print_exc()
sys.exit(1)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -376,8 +376,8 @@ class CodeAnalyzer:
for match in re.finditer(pattern, content):
module = match.group(1)
# Extract package name (before first /)
package = module.split('/')[0]
if package and not package.startswith('.'): # Skip relative imports
package = module.split("/")[0]
if package and not package.startswith("."): # Skip relative imports
imports.append(package)
return {
@@ -694,11 +694,11 @@ class CodeAnalyzer:
for match in re.finditer(using_pattern, content):
namespace = match.group(1).strip()
# Skip using aliases (using Foo = Bar.Baz)
if '=' not in namespace:
if "=" not in namespace:
# Extract base namespace (first 1-2 segments)
parts = namespace.split('.')
parts = namespace.split(".")
if len(parts) >= 2:
base_ns = '.'.join(parts[:2])
base_ns = ".".join(parts[:2])
imports.append(base_ns)
elif len(parts) == 1:
imports.append(parts[0])
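
A worked example of the base-namespace extraction above, assuming a captured C# using directive:

namespace = "Microsoft.Extensions.DependencyInjection"
parts = namespace.split(".")
base_ns = ".".join(parts[:2]) if len(parts) >= 2 else parts[0]
print(base_ns)  # Microsoft.Extensions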
@@ -1130,10 +1130,10 @@ class CodeAnalyzer:
for match in re.finditer(import_pattern, content):
import_path = match.group(1).strip()
# Extract package name (first 2-3 segments for framework detection)
parts = import_path.split('.')
parts = import_path.split(".")
if len(parts) >= 2:
# Get base package (e.g., "org.springframework" from "org.springframework.boot.SpringApplication")
package = '.'.join(parts[:2])
package = ".".join(parts[:2])
imports.append(package)
return {
@@ -1303,7 +1303,7 @@ class CodeAnalyzer:
for match in re.finditer(require_pattern, content):
module = match.group(1)
# Extract gem name (before first /)
gem = module.split('/')[0]
gem = module.split("/")[0]
imports.append(gem)
return {
@@ -1443,7 +1443,7 @@ class CodeAnalyzer:
for match in re.finditer(use_pattern, content):
namespace = match.group(1).strip()
# Extract vendor name (first segment)
parts = namespace.split('\\')
parts = namespace.split("\\")
if parts:
vendor = parts[0]
imports.append(vendor.lower())
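
The same idea for the PHP hunk above, as a one-off sketch: the vendor is the first backslash-separated segment of the namespace, lowercased.

namespace = "App\\Http\\Controllers"
vendor = namespace.split("\\")[0].lower()
print(vendor)  # app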

View File

@@ -1036,11 +1036,15 @@ def analyze_codebase(
# Save summary statistics
summary_json = pattern_output / "summary.json"
with open(summary_json, "w", encoding="utf-8") as f:
json.dump({
"statistics": stats,
"thresholds": multi_level["thresholds"],
"files_analyzed": len(pattern_results),
}, f, indent=2)
json.dump(
{
"statistics": stats,
"thresholds": multi_level["thresholds"],
"files_analyzed": len(pattern_results),
},
f,
indent=2,
)
# Log results with breakdown by confidence
logger.info(f"✅ Detected {stats['total']} patterns in {len(pattern_results)} files")
@@ -1931,21 +1935,15 @@ def _check_deprecated_flags(args):
"⚠️ DEPRECATED: --ai-mode local → use --enhance-level without API key instead"
)
elif args.ai_mode == "none":
warnings.append(
"⚠️ DEPRECATED: --ai-mode none → use --enhance-level 0 instead"
)
warnings.append("⚠️ DEPRECATED: --ai-mode none → use --enhance-level 0 instead")
# Deprecated: --quick flag
if hasattr(args, "quick") and args.quick:
warnings.append(
"⚠️ DEPRECATED: --quick → use --preset quick instead"
)
warnings.append("⚠️ DEPRECATED: --quick → use --preset quick instead")
# Deprecated: --comprehensive flag
if hasattr(args, "comprehensive") and args.comprehensive:
warnings.append(
"⚠️ DEPRECATED: --comprehensive → use --preset comprehensive instead"
)
warnings.append("⚠️ DEPRECATED: --comprehensive → use --preset comprehensive instead")
# Show warnings if any found
if warnings:
@@ -2000,24 +1998,22 @@ Examples:
parser.add_argument(
"--preset",
choices=["quick", "standard", "comprehensive"],
help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)"
help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)",
)
parser.add_argument(
"--preset-list",
action="store_true",
help="Show available presets and exit"
"--preset-list", action="store_true", help="Show available presets and exit"
)
# Legacy preset flags (kept for backward compatibility)
parser.add_argument(
"--quick",
action="store_true",
help="[DEPRECATED] Quick analysis - use '--preset quick' instead"
help="[DEPRECATED] Quick analysis - use '--preset quick' instead",
)
parser.add_argument(
"--comprehensive",
action="store_true",
help="[DEPRECATED] Comprehensive analysis - use '--preset comprehensive' instead"
help="[DEPRECATED] Comprehensive analysis - use '--preset comprehensive' instead",
)
parser.add_argument(
@@ -2129,6 +2125,7 @@ Examples:
# Handle --preset-list flag BEFORE parse_args() to avoid required --directory validation
if "--preset-list" in sys.argv:
from skill_seekers.cli.presets import PresetManager
print(PresetManager.format_preset_help())
return 0
@@ -2155,6 +2152,7 @@ Examples:
# Apply preset using PresetManager
if preset_name:
from skill_seekers.cli.presets import PresetManager
try:
preset_args = PresetManager.apply_preset(preset_name, vars(args))
# Update args with preset values
@@ -2162,9 +2160,7 @@ Examples:
setattr(args, key, value)
preset = PresetManager.get_preset(preset_name)
logger.info(
f"{preset.icon} {preset.name} analysis mode: {preset.description}"
)
logger.info(f"{preset.icon} {preset.name} analysis mode: {preset.description}")
except ValueError as e:
logger.error(f"{e}")
return 1

View File

@@ -19,6 +19,7 @@ import numpy as np
@dataclass
class EmbeddingConfig:
"""Configuration for embedding generation."""
provider: str # 'openai', 'cohere', 'huggingface', 'local'
model: str
dimension: int
@@ -31,6 +32,7 @@ class EmbeddingConfig:
@dataclass
class EmbeddingResult:
"""Result of embedding generation."""
embeddings: list[list[float]]
metadata: dict[str, Any] = field(default_factory=dict)
cached_count: int = 0
@@ -42,6 +44,7 @@ class EmbeddingResult:
@dataclass
class CostTracker:
"""Track embedding generation costs."""
total_tokens: int = 0
total_requests: int = 0
cache_hits: int = 0
@@ -64,12 +67,12 @@ class CostTracker:
cache_rate = (self.cache_hits / self.total_requests * 100) if self.total_requests > 0 else 0
return {
'total_requests': self.total_requests,
'total_tokens': self.total_tokens,
'cache_hits': self.cache_hits,
'cache_misses': self.cache_misses,
'cache_rate': f"{cache_rate:.1f}%",
'estimated_cost': f"${self.estimated_cost:.4f}"
"total_requests": self.total_requests,
"total_tokens": self.total_tokens,
"cache_hits": self.cache_hits,
"cache_misses": self.cache_misses,
"cache_rate": f"{cache_rate:.1f}%",
"estimated_cost": f"${self.estimated_cost:.4f}",
}
@@ -97,18 +100,18 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
# Pricing per 1M tokens (as of 2026)
PRICING = {
'text-embedding-ada-002': 0.10,
'text-embedding-3-small': 0.02,
'text-embedding-3-large': 0.13,
"text-embedding-ada-002": 0.10,
"text-embedding-3-small": 0.02,
"text-embedding-3-large": 0.13,
}
DIMENSIONS = {
'text-embedding-ada-002': 1536,
'text-embedding-3-small': 1536,
'text-embedding-3-large': 3072,
"text-embedding-ada-002": 1536,
"text-embedding-3-small": 1536,
"text-embedding-3-large": 3072,
}
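
Since PRICING is per 1M tokens, a quick worked estimate, assuming cost scales linearly with token count:

tokens = 250_000
price_per_million = 0.02  # text-embedding-3-small
print(f"${tokens / 1_000_000 * price_per_million:.4f}")  # $0.0050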
def __init__(self, model: str = 'text-embedding-ada-002', api_key: str | None = None):
def __init__(self, model: str = "text-embedding-ada-002", api_key: str | None = None):
"""Initialize OpenAI provider."""
self.model = model
self.api_key = api_key
@@ -119,9 +122,12 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
if self._client is None:
try:
from openai import OpenAI
self._client = OpenAI(api_key=self.api_key)
except ImportError:
raise ImportError("OpenAI package not installed. Install with: pip install openai") from None
raise ImportError(
"OpenAI package not installed. Install with: pip install openai"
) from None
return self._client
def generate_embeddings(self, texts: list[str]) -> list[list[float]]:
@@ -130,10 +136,7 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
embeddings = []
for text in texts:
response = client.embeddings.create(
model=self.model,
input=text
)
response = client.embeddings.create(model=self.model, input=text)
embeddings.append(response.data[0].embedding)
return embeddings
@@ -207,7 +210,7 @@ class EmbeddingCache:
if cache_file.exists():
try:
data = json.loads(cache_file.read_text())
embedding = data['embedding']
embedding = data["embedding"]
self._memory_cache[cache_key] = embedding
return embedding
except Exception:
@@ -226,12 +229,16 @@ class EmbeddingCache:
if self.cache_dir:
cache_file = self.cache_dir / f"{cache_key}.json"
try:
cache_file.write_text(json.dumps({
'text_hash': cache_key,
'model': model,
'embedding': embedding,
'timestamp': time.time()
}))
cache_file.write_text(
json.dumps(
{
"text_hash": cache_key,
"model": model,
"embedding": embedding,
"timestamp": time.time(),
}
)
)
except Exception as e:
print(f"⚠️ Warning: Failed to write cache: {e}")
@@ -252,9 +259,9 @@ class EmbeddingPipeline:
def _create_provider(self) -> EmbeddingProvider:
"""Create provider based on config."""
if self.config.provider == 'openai':
if self.config.provider == "openai":
return OpenAIEmbeddingProvider(self.config.model)
elif self.config.provider == 'local':
elif self.config.provider == "local":
return LocalEmbeddingProvider(self.config.dimension)
else:
raise ValueError(f"Unknown provider: {self.config.provider}")
@@ -264,11 +271,7 @@ class EmbeddingPipeline:
# Rough estimate: 1 token ≈ 4 characters
return len(text) // 4
def generate_batch(
self,
texts: list[str],
show_progress: bool = True
) -> EmbeddingResult:
def generate_batch(self, texts: list[str], show_progress: bool = True) -> EmbeddingResult:
"""
Generate embeddings for batch of texts.
@@ -293,7 +296,7 @@ class EmbeddingPipeline:
# Process in batches
for i in range(0, len(texts), self.config.batch_size):
batch = texts[i:i + self.config.batch_size]
batch = texts[i : i + self.config.batch_size]
batch_embeddings = []
to_generate = []
to_generate_indices = []
@@ -331,7 +334,7 @@ class EmbeddingPipeline:
if show_progress and len(texts) > self.config.batch_size:
progress = min(i + self.config.batch_size, len(texts))
print(f" Progress: {progress}/{len(texts)} ({progress/len(texts)*100:.1f}%)")
print(f" Progress: {progress}/{len(texts)} ({progress / len(texts) * 100:.1f}%)")
total_time = time.time() - start_time
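
The batch slice reformatted above follows the standard stride pattern; a standalone sketch:

texts = [f"doc {n}" for n in range(7)]
batch_size = 3
for i in range(0, len(texts), batch_size):
    print(texts[i : i + batch_size])
# ['doc 0', 'doc 1', 'doc 2']
# ['doc 3', 'doc 4', 'doc 5']
# ['doc 6']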
@@ -342,21 +345,21 @@ class EmbeddingPipeline:
print(f" Generated: {generated_count}")
print(f" Time: {total_time:.2f}s")
if self.config.provider != 'local':
if self.config.provider != "local":
stats = self.cost_tracker.get_stats()
print(f" Cost: {stats['estimated_cost']}")
return EmbeddingResult(
embeddings=embeddings,
metadata={
'provider': self.config.provider,
'model': self.config.model,
'dimension': self.provider.get_dimension()
"provider": self.config.provider,
"model": self.config.model,
"dimension": self.provider.get_dimension(),
},
cached_count=cached_count,
generated_count=generated_count,
total_time=total_time,
cost_estimate=self.cost_tracker.estimated_cost
cost_estimate=self.cost_tracker.estimated_cost,
)
def validate_dimensions(self, embeddings: list[list[float]]) -> bool:
@@ -373,8 +376,10 @@ class EmbeddingPipeline:
for i, embedding in enumerate(embeddings):
if len(embedding) != expected_dim:
print(f"❌ Dimension mismatch at index {i}: "
f"expected {expected_dim}, got {len(embedding)}")
print(
f"❌ Dimension mismatch at index {i}: "
f"expected {expected_dim}, got {len(embedding)}"
)
return False
return True
@@ -390,11 +395,11 @@ def example_usage():
# Configure pipeline
config = EmbeddingConfig(
provider='local', # Use 'openai' for production
model='text-embedding-ada-002',
provider="local", # Use 'openai' for production
model="text-embedding-ada-002",
dimension=384,
batch_size=50,
cache_dir=Path("output/.embeddings_cache")
cache_dir=Path("output/.embeddings_cache"),
)
# Initialize pipeline

View File

@@ -175,8 +175,7 @@ class LocalSkillEnhancer:
dangerous_chars = [";", "&", "|", "$", "`", "\n", "\r"]
if any(char in cmd_template for char in dangerous_chars):
raise ValueError(
"Custom command contains dangerous shell characters. "
f"Command: {cmd_template}"
f"Custom command contains dangerous shell characters. Command: {cmd_template}"
)
try:
@@ -888,9 +887,7 @@ rm {prompt_file}
print("❌ SKILL.md not found after enhancement")
return False
else:
print(
f"{self.agent_display} returned error (exit code: {result.returncode})"
)
print(f"{self.agent_display} returned error (exit code: {result.returncode})")
if result.stderr:
print(f" Error: {result.stderr[:200]}")
return False

View File

@@ -16,6 +16,7 @@ from datetime import datetime
@dataclass
class DocumentVersion:
"""Version information for a document."""
file_path: str
content_hash: str
size_bytes: int
@@ -26,6 +27,7 @@ class DocumentVersion:
@dataclass
class ChangeSet:
"""Set of changes detected."""
added: list[DocumentVersion]
modified: list[DocumentVersion]
deleted: list[str]
@@ -45,6 +47,7 @@ class ChangeSet:
@dataclass
class UpdateMetadata:
"""Metadata for an incremental update."""
timestamp: str
previous_version: str
new_version: str
@@ -86,7 +89,7 @@ class IncrementalUpdater:
sha256 = hashlib.sha256()
try:
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
while chunk := f.read(8192):
sha256.update(chunk)
return sha256.hexdigest()
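
A self-contained version of the chunked SHA-256 hashing above (the walrus operator requires Python 3.8+):

import hashlib
from pathlib import Path

def compute_file_hash(file_path: Path) -> str:
    sha256 = hashlib.sha256()
    with open(file_path, "rb") as f:
        while chunk := f.read(8192):  # read in 8KB chunks to bound memory use
            sha256.update(chunk)
    return sha256.hexdigest()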
@@ -111,7 +114,7 @@ class IncrementalUpdater:
content_hash=self._compute_file_hash(skill_md),
size_bytes=skill_md.stat().st_size,
last_modified=skill_md.stat().st_mtime,
version=1
version=1,
)
# Scan references
@@ -125,7 +128,7 @@ class IncrementalUpdater:
content_hash=self._compute_file_hash(ref_file),
size_bytes=ref_file.stat().st_size,
last_modified=ref_file.stat().st_mtime,
version=1
version=1,
)
return versions
@@ -157,9 +160,8 @@ class IncrementalUpdater:
"timestamp": datetime.now().isoformat(),
"version": "1.0.0",
"documents": {
file_path: asdict(version)
for file_path, version in self.current_versions.items()
}
file_path: asdict(version) for file_path, version in self.current_versions.items()
},
}
self.version_file.write_text(json.dumps(data, indent=2))
@@ -180,10 +182,7 @@ class IncrementalUpdater:
if not has_previous:
# First time - all files are "added"
return ChangeSet(
added=list(self.current_versions.values()),
modified=[],
deleted=[],
unchanged=[]
added=list(self.current_versions.values()), modified=[], deleted=[], unchanged=[]
)
# Detect changes
@@ -215,18 +214,10 @@ class IncrementalUpdater:
else:
unchanged.append(current)
return ChangeSet(
added=added,
modified=modified,
deleted=deleted,
unchanged=unchanged
)
return ChangeSet(added=added, modified=modified, deleted=deleted, unchanged=unchanged)
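
A minimal sketch of consuming the result, assuming total_changes sums added, modified, and deleted:

change_set = ChangeSet(added=[], modified=[], deleted=["refs/old.md"], unchanged=[])
print(change_set.total_changes)  # 1, under that assumption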
def generate_update_package(
self,
change_set: ChangeSet,
output_path: Path,
include_content: bool = True
self, change_set: ChangeSet, output_path: Path, include_content: bool = True
) -> Path:
"""
Generate incremental update package.
@@ -250,11 +241,11 @@ class IncrementalUpdater:
"added": len(change_set.added),
"modified": len(change_set.modified),
"deleted": len(change_set.deleted),
"unchanged": len(change_set.unchanged)
"unchanged": len(change_set.unchanged),
},
"total_changes": change_set.total_changes
"total_changes": change_set.total_changes,
},
"changes": {}
"changes": {},
}
# Include changed documents
@@ -267,7 +258,7 @@ class IncrementalUpdater:
"version": doc.version,
"content": file_path.read_text(encoding="utf-8"),
"hash": doc.content_hash,
"size": doc.size_bytes
"size": doc.size_bytes,
}
# Modified documents
@@ -278,14 +269,12 @@ class IncrementalUpdater:
"version": doc.version,
"content": file_path.read_text(encoding="utf-8"),
"hash": doc.content_hash,
"size": doc.size_bytes
"size": doc.size_bytes,
}
# Deleted documents
for file_path in change_set.deleted:
update_data["changes"][file_path] = {
"action": "delete"
}
update_data["changes"][file_path] = {"action": "delete"}
# Write package
output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -332,7 +321,9 @@ class IncrementalUpdater:
if prev:
size_diff = doc.size_bytes - prev.size_bytes
size_str = f"{size_diff:+,} bytes" if size_diff != 0 else "same size"
lines.append(f" ~ {doc.file_path} (v{prev.version} → v{doc.version}, {size_str})")
lines.append(
f" ~ {doc.file_path} (v{prev.version} → v{doc.version}, {size_str})"
)
else:
lines.append(f" ~ {doc.file_path} (v{doc.version})")
lines.append("")
@@ -473,4 +464,5 @@ def main():
if __name__ == "__main__":
import sys
sys.exit(main())

View File

@@ -369,8 +369,6 @@ LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = {
(r"\$[0-9]+", 4),
(r"->", 3),
],
# ===== Markup/Config Languages =====
"html": [
(r"<!DOCTYPE\s+html>", 5),

View File

@@ -42,25 +42,25 @@ from skill_seekers.cli import __version__
# Command module mapping (command name -> module path)
COMMAND_MODULES = {
'config': 'skill_seekers.cli.config_command',
'scrape': 'skill_seekers.cli.doc_scraper',
'github': 'skill_seekers.cli.github_scraper',
'pdf': 'skill_seekers.cli.pdf_scraper',
'unified': 'skill_seekers.cli.unified_scraper',
'enhance': 'skill_seekers.cli.enhance_skill_local',
'enhance-status': 'skill_seekers.cli.enhance_status',
'package': 'skill_seekers.cli.package_skill',
'upload': 'skill_seekers.cli.upload_skill',
'estimate': 'skill_seekers.cli.estimate_pages',
'extract-test-examples': 'skill_seekers.cli.test_example_extractor',
'install-agent': 'skill_seekers.cli.install_agent',
'analyze': 'skill_seekers.cli.codebase_scraper',
'install': 'skill_seekers.cli.install_skill',
'resume': 'skill_seekers.cli.resume_command',
'stream': 'skill_seekers.cli.streaming_ingest',
'update': 'skill_seekers.cli.incremental_updater',
'multilang': 'skill_seekers.cli.multilang_support',
'quality': 'skill_seekers.cli.quality_metrics',
"config": "skill_seekers.cli.config_command",
"scrape": "skill_seekers.cli.doc_scraper",
"github": "skill_seekers.cli.github_scraper",
"pdf": "skill_seekers.cli.pdf_scraper",
"unified": "skill_seekers.cli.unified_scraper",
"enhance": "skill_seekers.cli.enhance_skill_local",
"enhance-status": "skill_seekers.cli.enhance_status",
"package": "skill_seekers.cli.package_skill",
"upload": "skill_seekers.cli.upload_skill",
"estimate": "skill_seekers.cli.estimate_pages",
"extract-test-examples": "skill_seekers.cli.test_example_extractor",
"install-agent": "skill_seekers.cli.install_agent",
"analyze": "skill_seekers.cli.codebase_scraper",
"install": "skill_seekers.cli.install_skill",
"resume": "skill_seekers.cli.resume_command",
"stream": "skill_seekers.cli.streaming_ingest",
"update": "skill_seekers.cli.incremental_updater",
"multilang": "skill_seekers.cli.multilang_support",
"quality": "skill_seekers.cli.quality_metrics",
}
@@ -124,12 +124,21 @@ def _reconstruct_argv(command: str, args: argparse.Namespace) -> list[str]:
# Convert args to sys.argv format
for key, value in vars(args).items():
if key == 'command':
if key == "command":
continue
# Handle positional arguments (no -- prefix)
if key in ['url', 'directory', 'file', 'job_id', 'skill_directory', 'zip_file', 'config', 'input_file']:
if value is not None and value != '':
if key in [
"url",
"directory",
"file",
"job_id",
"skill_directory",
"zip_file",
"config",
"input_file",
]:
if value is not None and value != "":
argv.append(str(value))
continue
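
A hedged round-trip sketch of the reconstruction above; the option branch is an assumption, since this hunk only shows the positional handling:

from argparse import Namespace

args = Namespace(command="scrape", url="https://docs.example.com", max_pages=50)
argv = []
for key, value in vars(args).items():
    if key == "command":
        continue
    if key in ["url"]:  # positional arguments get no -- prefix
        if value is not None and value != "":
            argv.append(str(value))
        continue
    if value is not None:  # assumed option handling, not shown in this hunk
        argv.extend([f"--{key.replace('_', '-')}", str(value)])
print(argv)  # ['https://docs.example.com', '--max-pages', '50']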
@@ -172,7 +181,7 @@ def main(argv: list[str] | None = None) -> int:
return 1
# Special handling for 'analyze' command (has post-processing)
if args.command == 'analyze':
if args.command == "analyze":
return _handle_analyze_command(args)
# Standard delegation for all other commands
@@ -200,6 +209,7 @@ def main(argv: list[str] | None = None) -> int:
# Show traceback in verbose mode
import traceback
if hasattr(args, "verbose") and getattr(args, "verbose", False):
traceback.print_exc()
@@ -226,13 +236,16 @@ def _handle_analyze_command(args: argparse.Namespace) -> int:
# Handle preset flags (depth and features)
if args.quick:
sys.argv.extend([
"--depth", "surface",
"--skip-patterns",
"--skip-test-examples",
"--skip-how-to-guides",
"--skip-config-patterns",
])
sys.argv.extend(
[
"--depth",
"surface",
"--skip-patterns",
"--skip-test-examples",
"--skip-how-to-guides",
"--skip-config-patterns",
]
)
elif args.comprehensive:
sys.argv.extend(["--depth", "full"])
elif args.depth:
@@ -246,6 +259,7 @@ def _handle_analyze_command(args: argparse.Namespace) -> int:
elif args.enhance:
try:
from skill_seekers.cli.config_manager import get_config_manager
config = get_config_manager()
enhance_level = config.get_default_enhance_level()
except Exception:

View File

@@ -15,6 +15,7 @@ import json
@dataclass
class LanguageInfo:
"""Language information for a document."""
code: str # ISO 639-1 code (e.g., 'en', 'es', 'zh')
name: str # Full name (e.g., 'English', 'Spanish', 'Chinese')
confidence: float # Detection confidence (0.0-1.0)
@@ -24,6 +25,7 @@ class LanguageInfo:
@dataclass
class TranslationStatus:
"""Translation status for a document."""
source_language: str
target_languages: list[str]
translated_languages: set[str]
@@ -40,74 +42,81 @@ class LanguageDetector:
# Common word patterns by language
LANGUAGE_PATTERNS = {
'en': [
r'\b(the|and|is|are|in|to|of|for|with|on|at|by|from)\b',
r'\b(this|that|these|those|what|which|who|where|when)\b',
"en": [
r"\b(the|and|is|are|in|to|of|for|with|on|at|by|from)\b",
r"\b(this|that|these|those|what|which|who|where|when)\b",
],
'es': [
r'\b(el|la|los|las|de|en|y|a|es|por|para|con|su)\b',
r'\b(que|no|un|una|como|más|pero|muy|todo|ya)\b',
"es": [
r"\b(el|la|los|las|de|en|y|a|es|por|para|con|su)\b",
r"\b(que|no|un|una|como|más|pero|muy|todo|ya)\b",
],
'fr': [
r'\b(le|la|les|de|et|en|un|une|pour|dans|que|sur|avec)\b',
r'\b(est|sont|ce|qui|plus|ne|pas|nous|vous|tout)\b',
"fr": [
r"\b(le|la|les|de|et|en|un|une|pour|dans|que|sur|avec)\b",
r"\b(est|sont|ce|qui|plus|ne|pas|nous|vous|tout)\b",
],
'de': [
r'\b(der|die|das|und|in|zu|den|von|ist|mit|für|auf)\b',
r'\b(ein|eine|nicht|sich|auch|werden|an|als|ich|sie)\b',
"de": [
r"\b(der|die|das|und|in|zu|den|von|ist|mit|für|auf)\b",
r"\b(ein|eine|nicht|sich|auch|werden|an|als|ich|sie)\b",
],
'zh': [
r'[\u4e00-\u9fff]', # Chinese characters
r'(的|了|和|是|在|有|我|他|不|这)',
"zh": [
r"[\u4e00-\u9fff]", # Chinese characters
r"(的|了|和|是|在|有|我|他|不|这)",
],
'ja': [
r'[\u3040-\u309f]', # Hiragana
r'[\u30a0-\u30ff]', # Katakana
r'[\u4e00-\u9faf]', # Kanji
"ja": [
r"[\u3040-\u309f]", # Hiragana
r"[\u30a0-\u30ff]", # Katakana
r"[\u4e00-\u9faf]", # Kanji
],
'ko': [
r'[\uac00-\ud7af]', # Hangul
r'(의|가|이|은|들|는|좀|잘|께|을)',
"ko": [
r"[\uac00-\ud7af]", # Hangul
r"(의|가|이|은|들|는|좀|잘|께|을)",
],
'ru': [
r'[\u0400-\u04ff]', # Cyrillic
r'\b(и|в|не|на|с|что|он|по|а|как|это|все)\b',
"ru": [
r"[\u0400-\u04ff]", # Cyrillic
r"\b(и|в|не|на|с|что|он|по|а|как|это|все)\b",
],
'pt': [
r'\b(o|a|de|e|do|da|em|um|para|é|com|não|os|as)\b',
r'\b(que|se|mais|por|dos|das|como|mas|uma|ou)\b',
"pt": [
r"\b(o|a|de|e|do|da|em|um|para|é|com|não|os|as)\b",
r"\b(que|se|mais|por|dos|das|como|mas|uma|ou)\b",
],
'it': [
r'\b(il|la|di|e|a|da|in|che|per|un|una|non|del)\b',
r'\b(con|alla|della|al|nel|sono|come|più|ma|dei)\b',
"it": [
r"\b(il|la|di|e|a|da|in|che|per|un|una|non|del)\b",
r"\b(con|alla|della|al|nel|sono|come|più|ma|dei)\b",
],
'ar': [
r'[\u0600-\u06ff]', # Arabic
r'(في|من|على|إلى|هذا|ما|أن|كان|هو|التي)',
"ar": [
r"[\u0600-\u06ff]", # Arabic
r"(في|من|على|إلى|هذا|ما|أن|كان|هو|التي)",
],
}
# Language names
LANGUAGE_NAMES = {
'en': 'English',
'es': 'Spanish',
'fr': 'French',
'de': 'German',
'zh': 'Chinese',
'ja': 'Japanese',
'ko': 'Korean',
'ru': 'Russian',
'pt': 'Portuguese',
'it': 'Italian',
'ar': 'Arabic',
"en": "English",
"es": "Spanish",
"fr": "French",
"de": "German",
"zh": "Chinese",
"ja": "Japanese",
"ko": "Korean",
"ru": "Russian",
"pt": "Portuguese",
"it": "Italian",
"ar": "Arabic",
}
# Script types
SCRIPTS = {
'en': 'Latin', 'es': 'Latin', 'fr': 'Latin', 'de': 'Latin',
'pt': 'Latin', 'it': 'Latin',
'zh': 'Han', 'ja': 'Japanese', 'ko': 'Hangul',
'ru': 'Cyrillic', 'ar': 'Arabic',
"en": "Latin",
"es": "Latin",
"fr": "Latin",
"de": "Latin",
"pt": "Latin",
"it": "Latin",
"zh": "Han",
"ja": "Japanese",
"ko": "Hangul",
"ru": "Cyrillic",
"ar": "Arabic",
}
def detect(self, text: str, sample_size: int = 2000) -> LanguageInfo:
@@ -122,7 +131,7 @@ class LanguageDetector:
LanguageInfo with detected language
"""
if not text.strip():
return LanguageInfo('en', 'English', 0.0)
return LanguageInfo("en", "English", 0.0)
# Sample text for efficiency
sample = text[:sample_size].lower()
@@ -140,7 +149,7 @@ class LanguageDetector:
# Find best match
if not scores or max(scores.values()) == 0:
# Default to English
return LanguageInfo('en', 'English', 0.1)
return LanguageInfo("en", "English", 0.1)
best_lang = max(scores, key=scores.get)
total_score = sum(scores.values())
@@ -150,7 +159,7 @@ class LanguageDetector:
code=best_lang,
name=self.LANGUAGE_NAMES.get(best_lang, best_lang.upper()),
confidence=min(confidence, 1.0),
script=self.SCRIPTS.get(best_lang)
script=self.SCRIPTS.get(best_lang),
)
def detect_from_filename(self, filename: str) -> str | None:
@@ -170,12 +179,12 @@ class LanguageDetector:
ISO 639-1 language code or None
"""
# Pattern: file.en.md
match = re.search(r'\.([a-z]{2})\.md$', filename)
match = re.search(r"\.([a-z]{2})\.md$", filename)
if match and match.group(1) in self.LANGUAGE_NAMES:
return match.group(1)
# Pattern: file_en.md or file-en.md
match = re.search(r'[_-]([a-z]{2})\.md$', filename)
match = re.search(r"[_-]([a-z]{2})\.md$", filename)
if match and match.group(1) in self.LANGUAGE_NAMES:
return match.group(1)
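
Both filename patterns above in a runnable sketch, with the language table trimmed to two entries:

import re

LANGUAGE_NAMES = {"en": "English", "fr": "French"}

def detect_from_filename(filename: str) -> str | None:
    match = re.search(r"\.([a-z]{2})\.md$", filename)  # file.en.md
    if match and match.group(1) in LANGUAGE_NAMES:
        return match.group(1)
    match = re.search(r"[_-]([a-z]{2})\.md$", filename)  # file_en.md / file-en.md
    if match and match.group(1) in LANGUAGE_NAMES:
        return match.group(1)
    return None

assert detect_from_filename("guide.en.md") == "en"
assert detect_from_filename("guide_fr.md") == "fr"
assert detect_from_filename("guide.md") is None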
@@ -200,7 +209,7 @@ class MultiLanguageManager:
file_path: str,
content: str,
metadata: dict | None = None,
force_language: str | None = None
force_language: str | None = None,
) -> None:
"""
Add document with language detection.
@@ -218,7 +227,7 @@ class MultiLanguageManager:
code=lang_code,
name=self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
confidence=1.0,
script=self.detector.SCRIPTS.get(lang_code)
script=self.detector.SCRIPTS.get(lang_code),
)
else:
# Try filename pattern first
@@ -229,7 +238,7 @@ class MultiLanguageManager:
code=lang_code,
name=self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
confidence=0.95,
script=self.detector.SCRIPTS.get(lang_code)
script=self.detector.SCRIPTS.get(lang_code),
)
else:
# Detect from content
@@ -245,13 +254,13 @@ class MultiLanguageManager:
self.documents[lang_code] = []
doc = {
'file_path': file_path,
'content': content,
'language': lang_info.code,
'language_name': lang_info.name,
'confidence': lang_info.confidence,
'script': lang_info.script,
'metadata': metadata or {}
"file_path": file_path,
"content": content,
"language": lang_info.code,
"language_name": lang_info.name,
"confidence": lang_info.confidence,
"script": lang_info.script,
"metadata": metadata or {},
}
self.documents[lang_code].append(doc)
@@ -284,7 +293,7 @@ class MultiLanguageManager:
Returns:
Translation status summary
"""
base_lang = base_language or self.primary_language or 'en'
base_lang = base_language or self.primary_language or "en"
all_languages = set(self.documents.keys())
base_count = self.get_document_count(base_lang)
@@ -295,7 +304,7 @@ class MultiLanguageManager:
target_languages=[],
translated_languages=set(),
missing_languages=set(),
completeness=0.0
completeness=0.0,
)
# Check which languages have translations
@@ -305,7 +314,7 @@ class MultiLanguageManager:
translated.add(lang)
# Commonly expected languages for completeness
expected_languages = {'en', 'es', 'fr', 'de', 'zh', 'ja'}
expected_languages = {"en", "es", "fr", "de", "zh", "ja"}
missing = expected_languages - all_languages
completeness = len(all_languages) / len(expected_languages)
@@ -315,7 +324,7 @@ class MultiLanguageManager:
target_languages=list(all_languages - {base_lang}),
translated_languages=translated,
missing_languages=missing,
completeness=min(completeness, 1.0)
completeness=min(completeness, 1.0),
)
def export_by_language(self, output_dir: Path) -> dict[str, Path]:
@@ -337,10 +346,10 @@ class MultiLanguageManager:
lang_file = output_dir / f"documents_{lang_code}.json"
export_data = {
'language': lang_code,
'language_name': self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
'document_count': len(docs),
'documents': docs
"language": lang_code,
"language_name": self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
"document_count": len(docs),
"documents": docs,
}
lang_file.write_text(json.dumps(export_data, indent=2, ensure_ascii=False))
@@ -419,9 +428,7 @@ def main():
skill_md = skill_dir / "SKILL.md"
if skill_md.exists():
manager.add_document(
"SKILL.md",
skill_md.read_text(encoding="utf-8"),
{"category": "overview"}
"SKILL.md", skill_md.read_text(encoding="utf-8"), {"category": "overview"}
)
# Load reference files
@@ -429,9 +436,7 @@ def main():
if refs_dir.exists():
for ref_file in refs_dir.glob("*.md"):
manager.add_document(
ref_file.name,
ref_file.read_text(encoding="utf-8"),
{"category": ref_file.stem}
ref_file.name, ref_file.read_text(encoding="utf-8"), {"category": ref_file.stem}
)
# Detect languages
@@ -460,4 +465,5 @@ def main():
if __name__ == "__main__":
import sys
sys.exit(main())

View File

@@ -113,7 +113,15 @@ def package_skill(
output_dir = skill_path.parent
# Auto-enable chunking for RAG platforms
RAG_PLATFORMS = ['langchain', 'llama-index', 'haystack', 'weaviate', 'chroma', 'faiss', 'qdrant']
RAG_PLATFORMS = [
"langchain",
"llama-index",
"haystack",
"weaviate",
"chroma",
"faiss",
"qdrant",
]
if target in RAG_PLATFORMS and not enable_chunking:
print(f" Auto-enabling chunking for {target} platform")
@@ -126,17 +134,19 @@ def package_skill(
if streaming:
print(f" Mode: Streaming (chunk_size={chunk_size}, overlap={chunk_overlap})")
elif enable_chunking:
print(f" Chunking: Enabled (max_tokens={chunk_max_tokens}, preserve_code={preserve_code_blocks})")
print(
f" Chunking: Enabled (max_tokens={chunk_max_tokens}, preserve_code={preserve_code_blocks})"
)
try:
# Use streaming if requested and supported
if streaming and hasattr(adaptor, 'package_streaming'):
if streaming and hasattr(adaptor, "package_streaming"):
package_path = adaptor.package_streaming(
skill_path,
output_dir,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
batch_size=batch_size
batch_size=batch_size,
)
elif streaming:
print("⚠️ Streaming not supported for this platform, using standard packaging")
@@ -145,7 +155,7 @@ def package_skill(
output_dir,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
else:
package_path = adaptor.package(
@@ -153,7 +163,7 @@ def package_skill(
output_dir,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
print(f" Output: {package_path}")
@@ -212,7 +222,19 @@ Examples:
parser.add_argument(
"--target",
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "haystack", "weaviate", "chroma", "faiss", "qdrant"],
choices=[
"claude",
"gemini",
"openai",
"markdown",
"langchain",
"llama-index",
"haystack",
"weaviate",
"chroma",
"faiss",
"qdrant",
],
default="claude",
help="Target LLM platform (default: claude)",
)

View File

@@ -3,6 +3,7 @@
This module registers all subcommand parsers and provides a factory
function to create them.
"""
from .base import SubcommandParser
# Import all parser classes

View File

@@ -1,4 +1,5 @@
"""Analyze subcommand parser."""
from .base import SubcommandParser
@@ -21,26 +22,26 @@ class AnalyzeParser(SubcommandParser):
"""Add analyze-specific arguments."""
parser.add_argument("--directory", required=True, help="Directory to analyze")
parser.add_argument(
"--output", default="output/codebase/", help="Output directory (default: output/codebase/)"
"--output",
default="output/codebase/",
help="Output directory (default: output/codebase/)",
)
# Preset selection (NEW - recommended way)
parser.add_argument(
"--preset",
choices=["quick", "standard", "comprehensive"],
help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)"
help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)",
)
parser.add_argument(
"--preset-list",
action="store_true",
help="Show available presets and exit"
"--preset-list", action="store_true", help="Show available presets and exit"
)
# Legacy preset flags (kept for backward compatibility)
parser.add_argument(
"--quick",
action="store_true",
help="[DEPRECATED] Quick analysis - use '--preset quick' instead"
help="[DEPRECATED] Quick analysis - use '--preset quick' instead",
)
parser.add_argument(
"--comprehensive",
@@ -71,15 +72,9 @@ class AnalyzeParser(SubcommandParser):
help="AI enhancement level: 0=off, 1=SKILL.md only (default), 2=+Architecture+Config, 3=full",
)
parser.add_argument("--skip-api-reference", action="store_true", help="Skip API docs")
parser.add_argument(
"--skip-dependency-graph", action="store_true", help="Skip dep graph"
)
parser.add_argument(
"--skip-patterns", action="store_true", help="Skip pattern detection"
)
parser.add_argument(
"--skip-test-examples", action="store_true", help="Skip test examples"
)
parser.add_argument("--skip-dependency-graph", action="store_true", help="Skip dep graph")
parser.add_argument("--skip-patterns", action="store_true", help="Skip pattern detection")
parser.add_argument("--skip-test-examples", action="store_true", help="Skip test examples")
parser.add_argument("--skip-how-to-guides", action="store_true", help="Skip guides")
parser.add_argument("--skip-config-patterns", action="store_true", help="Skip config")
parser.add_argument(

View File

@@ -1,4 +1,5 @@
"""Base parser class for subcommands."""
from abc import ABC, abstractmethod
import argparse
@@ -48,10 +49,6 @@ class SubcommandParser(ABC):
Returns:
Configured ArgumentParser for this subcommand
"""
parser = subparsers.add_parser(
self.name,
help=self.help,
description=self.description
)
parser = subparsers.add_parser(self.name, help=self.help, description=self.description)
self.add_arguments(parser)
return parser

View File

@@ -1,4 +1,5 @@
"""Config subcommand parser."""
from .base import SubcommandParser
@@ -22,9 +23,7 @@ class ConfigParser(SubcommandParser):
parser.add_argument(
"--github", action="store_true", help="Go directly to GitHub token setup"
)
parser.add_argument(
"--api-keys", action="store_true", help="Go directly to API keys setup"
)
parser.add_argument("--api-keys", action="store_true", help="Go directly to API keys setup")
parser.add_argument(
"--show", action="store_true", help="Show current configuration and exit"
)

View File

@@ -1,4 +1,5 @@
"""Enhance subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Enhance-status subcommand parser."""
from .base import SubcommandParser
@@ -20,10 +21,6 @@ class EnhanceStatusParser(SubcommandParser):
def add_arguments(self, parser):
"""Add enhance-status-specific arguments."""
parser.add_argument("skill_directory", help="Skill directory path")
parser.add_argument(
"--watch", "-w", action="store_true", help="Watch in real-time"
)
parser.add_argument("--watch", "-w", action="store_true", help="Watch in real-time")
parser.add_argument("--json", action="store_true", help="JSON output")
parser.add_argument(
"--interval", type=int, default=2, help="Watch interval in seconds"
)
parser.add_argument("--interval", type=int, default=2, help="Watch interval in seconds")

View File

@@ -1,4 +1,5 @@
"""Estimate subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""GitHub subcommand parser."""
from .base import SubcommandParser
@@ -24,9 +25,7 @@ class GitHubParser(SubcommandParser):
parser.add_argument("--name", help="Skill name")
parser.add_argument("--description", help="Skill description")
parser.add_argument("--enhance", action="store_true", help="AI enhancement (API)")
parser.add_argument(
"--enhance-local", action="store_true", help="AI enhancement (local)"
)
parser.add_argument("--enhance-local", action="store_true", help="AI enhancement (local)")
parser.add_argument("--api-key", type=str, help="Anthropic API key for --enhance")
parser.add_argument(
"--non-interactive",

View File

@@ -1,4 +1,5 @@
"""Install-agent subcommand parser."""
from .base import SubcommandParser
@@ -19,9 +20,7 @@ class InstallAgentParser(SubcommandParser):
def add_arguments(self, parser):
"""Add install-agent-specific arguments."""
parser.add_argument(
"skill_directory", help="Skill directory path (e.g., output/react/)"
)
parser.add_argument("skill_directory", help="Skill directory path (e.g., output/react/)")
parser.add_argument(
"--agent",
required=True,

View File

@@ -1,4 +1,5 @@
"""Install subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Multilang subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Package subcommand parser."""
from .base import SubcommandParser
@@ -20,27 +21,72 @@ class PackageParser(SubcommandParser):
def add_arguments(self, parser):
"""Add package-specific arguments."""
parser.add_argument("skill_directory", help="Skill directory path (e.g., output/react/)")
parser.add_argument("--no-open", action="store_true", help="Don't open output folder after packaging")
parser.add_argument("--skip-quality-check", action="store_true", help="Skip quality checks before packaging")
parser.add_argument(
"--no-open", action="store_true", help="Don't open output folder after packaging"
)
parser.add_argument(
"--skip-quality-check", action="store_true", help="Skip quality checks before packaging"
)
parser.add_argument(
"--target",
choices=[
"claude", "gemini", "openai", "markdown",
"langchain", "llama-index", "haystack",
"weaviate", "chroma", "faiss", "qdrant"
"claude",
"gemini",
"openai",
"markdown",
"langchain",
"llama-index",
"haystack",
"weaviate",
"chroma",
"faiss",
"qdrant",
],
default="claude",
help="Target LLM platform (default: claude)",
)
parser.add_argument("--upload", action="store_true", help="Automatically upload after packaging (requires platform API key)")
parser.add_argument(
"--upload",
action="store_true",
help="Automatically upload after packaging (requires platform API key)",
)
# Streaming options
parser.add_argument("--streaming", action="store_true", help="Use streaming ingestion for large docs (memory-efficient)")
parser.add_argument("--chunk-size", type=int, default=4000, help="Maximum characters per chunk (streaming mode, default: 4000)")
parser.add_argument("--chunk-overlap", type=int, default=200, help="Overlap between chunks (streaming mode, default: 200)")
parser.add_argument("--batch-size", type=int, default=100, help="Number of chunks per batch (streaming mode, default: 100)")
parser.add_argument(
"--streaming",
action="store_true",
help="Use streaming ingestion for large docs (memory-efficient)",
)
parser.add_argument(
"--chunk-size",
type=int,
default=4000,
help="Maximum characters per chunk (streaming mode, default: 4000)",
)
parser.add_argument(
"--chunk-overlap",
type=int,
default=200,
help="Overlap between chunks (streaming mode, default: 200)",
)
parser.add_argument(
"--batch-size",
type=int,
default=100,
help="Number of chunks per batch (streaming mode, default: 100)",
)
# RAG chunking options
parser.add_argument("--chunk", action="store_true", help="Enable intelligent chunking for RAG platforms (auto-enabled for RAG adaptors)")
parser.add_argument("--chunk-tokens", type=int, default=512, help="Maximum tokens per chunk (default: 512)")
parser.add_argument("--no-preserve-code", action="store_true", help="Allow code block splitting (default: code blocks preserved)")
parser.add_argument(
"--chunk",
action="store_true",
help="Enable intelligent chunking for RAG platforms (auto-enabled for RAG adaptors)",
)
parser.add_argument(
"--chunk-tokens", type=int, default=512, help="Maximum tokens per chunk (default: 512)"
)
parser.add_argument(
"--no-preserve-code",
action="store_true",
help="Allow code block splitting (default: code blocks preserved)",
)

View File

@@ -1,4 +1,5 @@
"""PDF subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Quality subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Resume subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Scrape subcommand parser."""
from .base import SubcommandParser
@@ -24,15 +25,16 @@ class ScrapeParser(SubcommandParser):
parser.add_argument("--name", help="Skill name")
parser.add_argument("--description", help="Skill description")
parser.add_argument(
"--max-pages", type=int, dest="max_pages", help="Maximum pages to scrape (override config)"
"--max-pages",
type=int,
dest="max_pages",
help="Maximum pages to scrape (override config)",
)
parser.add_argument(
"--skip-scrape", action="store_true", help="Skip scraping, use cached data"
)
parser.add_argument("--enhance", action="store_true", help="AI enhancement (API)")
parser.add_argument(
"--enhance-local", action="store_true", help="AI enhancement (local)"
)
parser.add_argument("--enhance-local", action="store_true", help="AI enhancement (local)")
parser.add_argument("--dry-run", action="store_true", help="Dry run mode")
parser.add_argument(
"--async", dest="async_mode", action="store_true", help="Use async scraping"

View File

@@ -1,4 +1,5 @@
"""Stream subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Extract-test-examples subcommand parser."""
from .base import SubcommandParser
@@ -19,9 +20,7 @@ class TestExamplesParser(SubcommandParser):
def add_arguments(self, parser):
"""Add extract-test-examples-specific arguments."""
parser.add_argument(
"directory", nargs="?", help="Directory containing test files"
)
parser.add_argument("directory", nargs="?", help="Directory containing test files")
parser.add_argument("--file", help="Single test file to analyze")
parser.add_argument(
"--language", help="Filter by programming language (python, javascript, etc.)"
@@ -36,6 +35,4 @@ class TestExamplesParser(SubcommandParser):
"--max-per-file", type=int, default=10, help="Maximum examples per file (default: 10)"
)
parser.add_argument("--json", action="store_true", help="Output JSON format")
parser.add_argument(
"--markdown", action="store_true", help="Output Markdown format"
)
parser.add_argument("--markdown", action="store_true", help="Output Markdown format")

View File

@@ -1,4 +1,5 @@
"""Unified subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Update subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Upload subcommand parser."""
from .base import SubcommandParser
@@ -19,7 +20,9 @@ class UploadParser(SubcommandParser):
def add_arguments(self, parser):
"""Add upload-specific arguments."""
parser.add_argument("package_file", help="Path to skill package file (e.g., output/react.zip)")
parser.add_argument(
"package_file", help="Path to skill package file (e.g., output/react.zip)"
)
parser.add_argument(
"--target",
@@ -33,22 +36,34 @@ class UploadParser(SubcommandParser):
# ChromaDB upload options
parser.add_argument(
"--chroma-url",
help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)"
help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)",
)
parser.add_argument(
"--persist-directory",
help="Local directory for persistent ChromaDB storage (default: ./chroma_db)"
help="Local directory for persistent ChromaDB storage (default: ./chroma_db)",
)
# Embedding options
parser.add_argument(
"--embedding-function",
choices=["openai", "sentence-transformers", "none"],
help="Embedding function for ChromaDB/Weaviate (default: platform default)"
help="Embedding function for ChromaDB/Weaviate (default: platform default)",
)
parser.add_argument(
"--openai-api-key", help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)"
)
parser.add_argument("--openai-api-key", help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)")
# Weaviate upload options
parser.add_argument("--weaviate-url", default="http://localhost:8080", help="Weaviate URL (default: http://localhost:8080)")
parser.add_argument("--use-cloud", action="store_true", help="Use Weaviate Cloud (requires --api-key and --cluster-url)")
parser.add_argument("--cluster-url", help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)")
parser.add_argument(
"--weaviate-url",
default="http://localhost:8080",
help="Weaviate URL (default: http://localhost:8080)",
)
parser.add_argument(
"--use-cloud",
action="store_true",
help="Use Weaviate Cloud (requires --api-key and --cluster-url)",
)
parser.add_argument(
"--cluster-url", help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)"
)

View File

@@ -30,14 +30,14 @@ logger = logging.getLogger(__name__)
# Confidence thresholds for pattern filtering (Issue #240)
CONFIDENCE_THRESHOLDS = {
'critical': 0.80, # High-confidence patterns for ARCHITECTURE.md
'high': 0.70, # Include in detailed analysis
'medium': 0.60, # Include with warning/context
'low': 0.50, # Minimum detection threshold
"critical": 0.80, # High-confidence patterns for ARCHITECTURE.md
"high": 0.70, # Include in detailed analysis
"medium": 0.60, # Include with warning/context
"low": 0.50, # Minimum detection threshold
}
# Default minimum confidence for pattern detection
DEFAULT_MIN_CONFIDENCE = CONFIDENCE_THRESHOLDS['low']
DEFAULT_MIN_CONFIDENCE = CONFIDENCE_THRESHOLDS["low"]
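
How the thresholds are applied, assuming filter_patterns_by_confidence keeps patterns at or above the given cutoff:

patterns = [
    {"name": "singleton", "confidence": 0.85},
    {"name": "factory", "confidence": 0.65},
]
critical = [p for p in patterns if p["confidence"] >= CONFIDENCE_THRESHOLDS["critical"]]
print(critical)  # [{'name': 'singleton', 'confidence': 0.85}]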
@dataclass
@@ -1697,9 +1697,11 @@ def create_multi_level_report(pattern_results: list[dict]) -> dict:
all_patterns_sorted = sorted(all_patterns, key=lambda p: p.get("confidence", 0.0), reverse=True)
# Filter by confidence levels
critical = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS['critical'])
high_confidence = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS['high'])
medium = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS['medium'])
critical = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS["critical"])
high_confidence = filter_patterns_by_confidence(
all_patterns_sorted, CONFIDENCE_THRESHOLDS["high"]
)
medium = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS["medium"])
return {
"all_patterns": all_patterns_sorted,

View File

@@ -3,6 +3,7 @@
Provides predefined analysis configurations with clear trade-offs
between speed and comprehensiveness.
"""
from dataclasses import dataclass
@@ -13,6 +14,7 @@ class AnalysisPreset:
Defines a complete analysis configuration including depth,
feature flags, and AI enhancement level.
"""
name: str
description: str
depth: str # surface, deep, full
@@ -29,54 +31,52 @@ PRESETS = {
description="Fast basic analysis (1-2 min, essential features only)",
depth="surface",
features={
"api_reference": True, # ON - Essential for API docs
"api_reference": True, # ON - Essential for API docs
"dependency_graph": False, # OFF - Slow, not critical for quick
"patterns": False, # OFF - Slow pattern detection
"test_examples": False, # OFF - Time-consuming extraction
"how_to_guides": False, # OFF - Requires AI enhancement
"config_patterns": False, # OFF - Not critical for quick scan
"docs": True, # ON - README/docs are essential
"patterns": False, # OFF - Slow pattern detection
"test_examples": False, # OFF - Time-consuming extraction
"how_to_guides": False, # OFF - Requires AI enhancement
"config_patterns": False, # OFF - Not critical for quick scan
"docs": True, # ON - README/docs are essential
},
enhance_level=0, # No AI enhancement (fast)
estimated_time="1-2 minutes",
icon=""
icon="",
),
"standard": AnalysisPreset(
name="Standard",
description="Balanced analysis (5-10 min, core features, DEFAULT)",
depth="deep",
features={
"api_reference": True, # ON - Core feature
"dependency_graph": True, # ON - Valuable insights
"patterns": True, # ON - Design pattern detection
"test_examples": True, # ON - Real usage examples
"how_to_guides": False, # OFF - Requires AI (slow)
"config_patterns": True, # ON - Configuration docs
"docs": True, # ON - Project documentation
"api_reference": True, # ON - Core feature
"dependency_graph": True, # ON - Valuable insights
"patterns": True, # ON - Design pattern detection
"test_examples": True, # ON - Real usage examples
"how_to_guides": False, # OFF - Requires AI (slow)
"config_patterns": True, # ON - Configuration docs
"docs": True, # ON - Project documentation
},
enhance_level=1, # SKILL.md enhancement only
estimated_time="5-10 minutes",
icon="🎯"
icon="🎯",
),
"comprehensive": AnalysisPreset(
name="Comprehensive",
description="Full analysis (20-60 min, all features + AI)",
depth="full",
features={
"api_reference": True, # ON - Complete API docs
"dependency_graph": True, # ON - Full dependency analysis
"patterns": True, # ON - All design patterns
"test_examples": True, # ON - All test examples
"how_to_guides": True, # ON - AI-generated guides
"config_patterns": True, # ON - All configuration patterns
"docs": True, # ON - All project docs
"api_reference": True, # ON - Complete API docs
"dependency_graph": True, # ON - Full dependency analysis
"patterns": True, # ON - All design patterns
"test_examples": True, # ON - All test examples
"how_to_guides": True, # ON - AI-generated guides
"config_patterns": True, # ON - All configuration patterns
"docs": True, # ON - All project docs
},
enhance_level=3, # Full AI enhancement (all features)
estimated_time="20-60 minutes",
icon="🚀"
)
icon="🚀",
),
}
@@ -142,10 +142,7 @@ class PresetManager:
raise ValueError(f"Unknown preset: {preset_name}")
# Start with preset defaults
updated_args = {
'depth': preset.depth,
'enhance_level': preset.enhance_level
}
updated_args = {"depth": preset.depth, "enhance_level": preset.enhance_level}
# Convert feature flags to skip_* arguments
# feature=False → skip_feature=True (disabled)
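The comment above implies a straight inversion of each flag. A minimal sketch of that conversion (helper name hypothetical):

def preset_to_cli_args(preset):
    args = {"depth": preset.depth, "enhance_level": preset.enhance_level}
    for feature, enabled in preset.features.items():
        # feature=False becomes skip_<feature>=True, per the comment above
        args[f"skip_{feature}"] = not enabled
    return args

# e.g. preset_to_cli_args(PRESETS["quick"]) yields skip_patterns=True, skip_docs=False, ...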

View File

@@ -16,6 +16,7 @@ from enum import Enum
class MetricLevel(Enum):
"""Metric severity level."""
INFO = "info"
WARNING = "warning"
ERROR = "error"
@@ -25,6 +26,7 @@ class MetricLevel(Enum):
@dataclass
class QualityMetric:
"""Individual quality metric."""
name: str
value: float # 0.0-1.0 (or 0-100 percentage)
level: MetricLevel
@@ -35,6 +37,7 @@ class QualityMetric:
@dataclass
class QualityScore:
"""Overall quality score."""
total_score: float # 0-100
completeness: float # 0-100
accuracy: float # 0-100
@@ -46,6 +49,7 @@ class QualityScore:
@dataclass
class QualityReport:
"""Complete quality report."""
timestamp: str
skill_name: str
overall_score: QualityScore
@@ -64,10 +68,17 @@ class QualityAnalyzer:
# Thresholds for quality grades
GRADE_THRESHOLDS = {
'A+': 95, 'A': 90, 'A-': 85,
'B+': 80, 'B': 75, 'B-': 70,
'C+': 65, 'C': 60, 'C-': 55,
'D': 50, 'F': 0
"A+": 95,
"A": 90,
"A-": 85,
"B+": 80,
"B": 75,
"B-": 70,
"C+": 65,
"C": 60,
"C-": 55,
"D": 50,
"F": 0,
}
def __init__(self, skill_dir: Path):
@@ -102,7 +113,7 @@ class QualityAnalyzer:
score += 10
# Has sections (10 points)
if content.count('#') >= 5:
if content.count("#") >= 5:
score += 10
# References directory (20 points)
@@ -134,13 +145,15 @@ class QualityAnalyzer:
if len(suggestions) == 0:
suggestions.append("Expand documentation coverage")
self.metrics.append(QualityMetric(
name="Completeness",
value=completeness,
level=level,
description=f"Documentation completeness: {completeness:.1f}%",
suggestions=suggestions
))
self.metrics.append(
QualityMetric(
name="Completeness",
value=completeness,
level=level,
description=f"Documentation completeness: {completeness:.1f}%",
suggestions=suggestions,
)
)
return completeness
@@ -166,14 +179,14 @@ class QualityAnalyzer:
content = skill_md.read_text(encoding="utf-8")
# Check for TODO markers (deduct 5 points each, max 20)
todo_count = content.lower().count('todo')
todo_count = content.lower().count("todo")
if todo_count > 0:
deduction = min(todo_count * 5, 20)
score -= deduction
issues.append(f"Found {todo_count} TODO markers")
# Check for placeholder text (deduct 10)
placeholders = ['lorem ipsum', 'placeholder', 'coming soon']
placeholders = ["lorem ipsum", "placeholder", "coming soon"]
for placeholder in placeholders:
if placeholder in content.lower():
score -= 10
@@ -195,13 +208,15 @@ class QualityAnalyzer:
if accuracy < 100 and issues:
suggestions.extend(issues[:3]) # Top 3 issues
self.metrics.append(QualityMetric(
name="Accuracy",
value=accuracy,
level=level,
description=f"Documentation accuracy: {accuracy:.1f}%",
suggestions=suggestions
))
self.metrics.append(
QualityMetric(
name="Accuracy",
value=accuracy,
level=level,
description=f"Documentation accuracy: {accuracy:.1f}%",
suggestions=suggestions,
)
)
return accuracy
@@ -234,13 +249,13 @@ class QualityAnalyzer:
# Check for specific types (20 points each)
ref_names = [f.stem.lower() for f in ref_files]
if any('getting' in name or 'start' in name for name in ref_names):
if any("getting" in name or "start" in name for name in ref_names):
score += 20
if any('api' in name or 'reference' in name for name in ref_names):
if any("api" in name or "reference" in name for name in ref_names):
score += 20
if any('example' in name or 'tutorial' in name for name in ref_names):
if any("example" in name or "tutorial" in name for name in ref_names):
score += 20
# Has diverse content (10 points)
@@ -258,13 +273,15 @@ class QualityAnalyzer:
suggestions.append("Add API reference documentation")
suggestions.append("Expand documentation coverage")
self.metrics.append(QualityMetric(
name="Coverage",
value=coverage,
level=level,
description=f"Documentation coverage: {coverage:.1f}%",
suggestions=suggestions
))
self.metrics.append(
QualityMetric(
name="Coverage",
value=coverage,
level=level,
description=f"Documentation coverage: {coverage:.1f}%",
suggestions=suggestions,
)
)
return coverage
@@ -308,56 +325,54 @@ class QualityAnalyzer:
if health < 100:
suggestions.extend(issues[:3])
self.metrics.append(QualityMetric(
name="Health",
value=health,
level=level,
description=f"Skill health: {health:.1f}%",
suggestions=suggestions
))
self.metrics.append(
QualityMetric(
name="Health",
value=health,
level=level,
description=f"Skill health: {health:.1f}%",
suggestions=suggestions,
)
)
return health
def calculate_statistics(self) -> dict[str, Any]:
"""Calculate skill statistics."""
stats = {
'total_files': 0,
'total_size_bytes': 0,
'markdown_files': 0,
'reference_files': 0,
'total_characters': 0,
'total_words': 0
"total_files": 0,
"total_size_bytes": 0,
"markdown_files": 0,
"reference_files": 0,
"total_characters": 0,
"total_words": 0,
}
# Count files and sizes
for md_file in self.skill_dir.rglob("*.md"):
stats['total_files'] += 1
stats['markdown_files'] += 1
stats["total_files"] += 1
stats["markdown_files"] += 1
size = md_file.stat().st_size
stats['total_size_bytes'] += size
stats["total_size_bytes"] += size
# Count words
try:
content = md_file.read_text(encoding="utf-8")
stats['total_characters'] += len(content)
stats['total_words'] += len(content.split())
stats["total_characters"] += len(content)
stats["total_words"] += len(content.split())
except Exception:
pass
# Count references
refs_dir = self.skill_dir / "references"
if refs_dir.exists():
stats['reference_files'] = len(list(refs_dir.glob("*.md")))
stats["reference_files"] = len(list(refs_dir.glob("*.md")))
self.statistics = stats
return stats
def calculate_overall_score(
self,
completeness: float,
accuracy: float,
coverage: float,
health: float
self, completeness: float, accuracy: float, coverage: float, health: float
) -> QualityScore:
"""
Calculate overall quality score.
@@ -368,15 +383,10 @@ class QualityAnalyzer:
- Coverage: 25%
- Health: 20%
"""
total = (
completeness * 0.30 +
accuracy * 0.25 +
coverage * 0.25 +
health * 0.20
)
total = completeness * 0.30 + accuracy * 0.25 + coverage * 0.25 + health * 0.20
# Determine grade
grade = 'F'
grade = "F"
for g, threshold in self.GRADE_THRESHOLDS.items():
if total >= threshold:
grade = g
@@ -388,7 +398,7 @@ class QualityAnalyzer:
accuracy=accuracy,
coverage=coverage,
health=health,
grade=grade
grade=grade,
)
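To make the weighting concrete: a skill scoring 85/90/70/95 on the four axes lands at 84.5 overall, which the descending threshold walk maps to a B+. A self-contained sketch (the real loop presumably breaks on first match; that line is not shown in this diff):

GRADE_THRESHOLDS = {
    "A+": 95, "A": 90, "A-": 85, "B+": 80, "B": 75, "B-": 70,
    "C+": 65, "C": 60, "C-": 55, "D": 50, "F": 0,
}

def grade_for(total):
    # Thresholds are ordered best to worst, so the first one cleared wins.
    for grade, threshold in GRADE_THRESHOLDS.items():
        if total >= threshold:
            return grade
    return "F"

total = 85 * 0.30 + 90 * 0.25 + 70 * 0.25 + 95 * 0.20  # 84.5
print(grade_for(total))  # B+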
def generate_recommendations(self, score: QualityScore) -> list[str]:
@@ -431,9 +441,7 @@ class QualityAnalyzer:
health = self.analyze_health()
# Calculate overall score
overall_score = self.calculate_overall_score(
completeness, accuracy, coverage, health
)
overall_score = self.calculate_overall_score(completeness, accuracy, coverage, health)
# Calculate statistics
stats = self.calculate_statistics()
@@ -447,7 +455,7 @@ class QualityAnalyzer:
overall_score=overall_score,
metrics=self.metrics,
statistics=stats,
recommendations=recommendations
recommendations=recommendations,
)
def format_report(self, report: QualityReport) -> str:
@@ -484,7 +492,7 @@ class QualityAnalyzer:
MetricLevel.INFO: "",
MetricLevel.WARNING: "⚠️",
MetricLevel.ERROR: "",
MetricLevel.CRITICAL: "🔴"
MetricLevel.CRITICAL: "🔴",
}.get(metric.level, "")
lines.append(f" {icon} {metric.name}: {metric.value:.1f}%")
@@ -553,4 +561,5 @@ def main():
if __name__ == "__main__":
import sys
sys.exit(main())

View File

@@ -75,10 +75,7 @@ class RAGChunker:
return len(text) // self.chars_per_token
def chunk_document(
self,
text: str,
metadata: dict,
source_file: str | None = None
self, text: str, metadata: dict, source_file: str | None = None
) -> list[dict]:
"""
Chunk single document into RAG-ready chunks.
@@ -125,11 +122,13 @@ class RAGChunker:
if source_file:
chunk_metadata["source_file"] = source_file
result.append({
"chunk_id": f"{metadata.get('source', 'unknown')}_{i}",
"page_content": chunk_text.strip(),
"metadata": chunk_metadata
})
result.append(
{
"chunk_id": f"{metadata.get('source', 'unknown')}_{i}",
"page_content": chunk_text.strip(),
"metadata": chunk_metadata,
}
)
logger.info(
f"Created {len(result)} chunks from {source_file or 'document'} "
@@ -153,14 +152,10 @@ class RAGChunker:
# Chunk main SKILL.md
skill_md = skill_dir / "SKILL.md"
if skill_md.exists():
with open(skill_md, encoding='utf-8') as f:
with open(skill_md, encoding="utf-8") as f:
content = f.read()
metadata = {
"source": skill_dir.name,
"category": "overview",
"file_type": "skill_md"
}
metadata = {"source": skill_dir.name, "category": "overview", "file_type": "skill_md"}
chunks = self.chunk_document(content, metadata, source_file="SKILL.md")
all_chunks.extend(chunks)
@@ -169,26 +164,21 @@ class RAGChunker:
references_dir = skill_dir / "references"
if references_dir.exists():
for ref_file in references_dir.glob("*.md"):
with open(ref_file, encoding='utf-8') as f:
with open(ref_file, encoding="utf-8") as f:
content = f.read()
metadata = {
"source": skill_dir.name,
"category": ref_file.stem,
"file_type": "reference"
"file_type": "reference",
}
chunks = self.chunk_document(
content,
metadata,
source_file=str(ref_file.relative_to(skill_dir))
content, metadata, source_file=str(ref_file.relative_to(skill_dir))
)
all_chunks.extend(chunks)
logger.info(
f"Chunked skill directory {skill_dir.name}: "
f"{len(all_chunks)} total chunks"
)
logger.info(f"Chunked skill directory {skill_dir.name}: {len(all_chunks)} total chunks")
return all_chunks
@@ -207,32 +197,25 @@ class RAGChunker:
# Match code blocks (``` fenced blocks)
# Use DOTALL flag to match across newlines
code_block_pattern = r'```[^\n]*\n.*?```'
code_block_pattern = r"```[^\n]*\n.*?```"
def replacer(match):
idx = len(code_blocks)
code_blocks.append({
"index": idx,
"content": match.group(0),
"start": match.start(),
"end": match.end()
})
code_blocks.append(
{
"index": idx,
"content": match.group(0),
"start": match.start(),
"end": match.end(),
}
)
return placeholder_pattern.format(idx=idx)
text_with_placeholders = re.sub(
code_block_pattern,
replacer,
text,
flags=re.DOTALL
)
text_with_placeholders = re.sub(code_block_pattern, replacer, text, flags=re.DOTALL)
return text_with_placeholders, code_blocks
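The placeholder scheme above is easiest to see end to end. A standalone sketch of the same extract/re-insert cycle:

import re

def extract(text):
    blocks = []
    def repl(match):
        idx = len(blocks)
        blocks.append({"index": idx, "content": match.group(0)})
        return f"<<CODE_BLOCK_{idx}>>"
    # DOTALL lets ".*?" span newlines inside a fenced block
    return re.sub(r"```[^\n]*\n.*?```", repl, text, flags=re.DOTALL), blocks

def reinsert(chunk, blocks):
    for block in blocks:
        chunk = chunk.replace(f"<<CODE_BLOCK_{block['index']}>>", block["content"])
    return chunk

doc = "Intro\n```python\nprint('hi')\n```\nOutro"
stripped, blocks = extract(doc)
assert reinsert(stripped, blocks) == doc  # round-trip is lossless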
def _reinsert_code_blocks(
self,
chunks: list[str],
code_blocks: list[dict]
) -> list[str]:
def _reinsert_code_blocks(self, chunks: list[str], code_blocks: list[dict]) -> list[str]:
"""
Re-insert code blocks into chunks.
@@ -249,7 +232,7 @@ class RAGChunker:
for block in code_blocks:
placeholder = f"<<CODE_BLOCK_{block['index']}>>"
if placeholder in chunk:
chunk = chunk.replace(placeholder, block['content'])
chunk = chunk.replace(placeholder, block["content"])
result.append(chunk)
return result
@@ -268,15 +251,15 @@ class RAGChunker:
# Paragraph boundaries (double newline)
if self.preserve_paragraphs:
for match in re.finditer(r'\n\n+', text):
for match in re.finditer(r"\n\n+", text):
boundaries.append(match.end())
# Section headers (# Header)
for match in re.finditer(r'\n#{1,6}\s+.+\n', text):
for match in re.finditer(r"\n#{1,6}\s+.+\n", text):
boundaries.append(match.start())
# Single newlines (less preferred, but useful)
for match in re.finditer(r'\n', text):
for match in re.finditer(r"\n", text):
boundaries.append(match.start())
# Add artificial boundaries for large documents
@@ -352,7 +335,9 @@ class RAGChunker:
# Add chunk if it meets minimum size requirement
# (unless the entire text is smaller than target size)
if chunk_text.strip() and (len(text) <= target_size_chars or len(chunk_text) >= min_size_chars):
if chunk_text.strip() and (
len(text) <= target_size_chars or len(chunk_text) >= min_size_chars
):
chunks.append(chunk_text)
# Move to next chunk with overlap
@@ -383,7 +368,7 @@ class RAGChunker:
"""
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
with open(output_path, "w", encoding="utf-8") as f:
json.dump(chunks, f, indent=2, ensure_ascii=False)
logger.info(f"Saved {len(chunks)} chunks to {output_path}")
@@ -393,7 +378,9 @@ def main():
"""CLI entry point for testing RAG chunker."""
import argparse
parser = argparse.ArgumentParser(description="RAG Chunker - Semantic chunking for RAG pipelines")
parser = argparse.ArgumentParser(
description="RAG Chunker - Semantic chunking for RAG pipelines"
)
parser.add_argument("skill_dir", type=Path, help="Path to skill directory")
parser.add_argument("--output", "-o", type=Path, help="Output JSON file")
parser.add_argument("--chunk-size", type=int, default=512, help="Target chunk size in tokens")

View File

@@ -59,27 +59,26 @@ def get_storage_adaptor(provider: str, **kwargs) -> BaseStorageAdaptor:
account_name='myaccount')
"""
adaptors = {
's3': S3StorageAdaptor,
'gcs': GCSStorageAdaptor,
'azure': AzureStorageAdaptor,
"s3": S3StorageAdaptor,
"gcs": GCSStorageAdaptor,
"azure": AzureStorageAdaptor,
}
provider_lower = provider.lower()
if provider_lower not in adaptors:
supported = ', '.join(adaptors.keys())
supported = ", ".join(adaptors.keys())
raise ValueError(
f"Unsupported storage provider: {provider}. "
f"Supported providers: {supported}"
f"Unsupported storage provider: {provider}. Supported providers: {supported}"
)
return adaptors[provider_lower](**kwargs)
__all__ = [
'BaseStorageAdaptor',
'StorageObject',
'S3StorageAdaptor',
'GCSStorageAdaptor',
'AzureStorageAdaptor',
'get_storage_adaptor',
"BaseStorageAdaptor",
"StorageObject",
"S3StorageAdaptor",
"GCSStorageAdaptor",
"AzureStorageAdaptor",
"get_storage_adaptor",
]
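A minimal usage sketch of the factory (bucket name hypothetical; credentials resolved from the environment, as the adaptors themselves do):

adaptor = get_storage_adaptor("s3", bucket="my-skill-packages")
adaptor.upload_file("output/react.zip", "skills/react.zip")
for obj in adaptor.list_files(prefix="skills/"):
    print(obj.key, obj.size)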

View File

@@ -9,6 +9,7 @@ from datetime import datetime, timedelta
try:
from azure.storage.blob import BlobServiceClient, BlobSasPermissions, generate_blob_sas
from azure.core.exceptions import ResourceNotFoundError
AZURE_AVAILABLE = True
except ImportError:
AZURE_AVAILABLE = False
@@ -65,38 +66,30 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
"Install with: pip install azure-storage-blob"
)
if 'container' not in kwargs:
if "container" not in kwargs:
raise ValueError("container parameter is required for Azure storage")
self.container_name = kwargs['container']
self.container_name = kwargs["container"]
# Initialize BlobServiceClient
if 'connection_string' in kwargs:
connection_string = kwargs['connection_string']
if "connection_string" in kwargs:
connection_string = kwargs["connection_string"]
else:
connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
if connection_string:
self.blob_service_client = BlobServiceClient.from_connection_string(
connection_string
)
self.blob_service_client = BlobServiceClient.from_connection_string(connection_string)
# Extract account name from connection string
self.account_name = None
self.account_key = None
for part in connection_string.split(';'):
if part.startswith('AccountName='):
self.account_name = part.split('=', 1)[1]
elif part.startswith('AccountKey='):
self.account_key = part.split('=', 1)[1]
for part in connection_string.split(";"):
if part.startswith("AccountName="):
self.account_name = part.split("=", 1)[1]
elif part.startswith("AccountKey="):
self.account_key = part.split("=", 1)[1]
else:
account_name = kwargs.get(
'account_name',
os.getenv('AZURE_STORAGE_ACCOUNT_NAME')
)
account_key = kwargs.get(
'account_key',
os.getenv('AZURE_STORAGE_ACCOUNT_KEY')
)
account_name = kwargs.get("account_name", os.getenv("AZURE_STORAGE_ACCOUNT_NAME"))
account_key = kwargs.get("account_key", os.getenv("AZURE_STORAGE_ACCOUNT_KEY"))
if not account_name or not account_key:
raise ValueError(
@@ -108,13 +101,10 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
self.account_key = account_key
account_url = f"https://{account_name}.blob.core.windows.net"
self.blob_service_client = BlobServiceClient(
account_url=account_url,
credential=account_key
account_url=account_url, credential=account_key
)
self.container_client = self.blob_service_client.get_container_client(
self.container_name
)
self.container_client = self.blob_service_client.get_container_client(self.container_name)
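The connection-string parsing above is just key=value pairs joined by semicolons; maxsplit=1 matters because account keys may themselves contain "=". In isolation (connection string hypothetical):

conn_str = "AccountName=myaccount;AccountKey=abc123==;EndpointSuffix=core.windows.net"

creds = {}
for part in conn_str.split(";"):
    if "=" in part:
        key, value = part.split("=", 1)  # split once: keys never contain "=", values may
        creds[key] = value

assert creds["AccountName"] == "myaccount"
assert creds["AccountKey"] == "abc123=="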
def upload_file(
self, local_path: str, remote_path: str, metadata: dict[str, str] | None = None
@@ -128,11 +118,7 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
blob_client = self.container_client.get_blob_client(remote_path)
with open(local_file, "rb") as data:
blob_client.upload_blob(
data,
overwrite=True,
metadata=metadata
)
blob_client.upload_blob(data, overwrite=True, metadata=metadata)
return f"https://{self.account_name}.blob.core.windows.net/{self.container_name}/{remote_path}"
except Exception as e:
@@ -164,25 +150,26 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
except Exception as e:
raise Exception(f"Azure deletion failed: {e}") from e
def list_files(
self, prefix: str = "", max_results: int = 1000
) -> list[StorageObject]:
def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]:
"""List files in Azure container."""
try:
blobs = self.container_client.list_blobs(
name_starts_with=prefix,
results_per_page=max_results
name_starts_with=prefix, results_per_page=max_results
)
files = []
for blob in blobs:
files.append(StorageObject(
key=blob.name,
size=blob.size,
last_modified=blob.last_modified.isoformat() if blob.last_modified else None,
etag=blob.etag,
metadata=blob.metadata
))
files.append(
StorageObject(
key=blob.name,
size=blob.size,
last_modified=blob.last_modified.isoformat()
if blob.last_modified
else None,
etag=blob.etag,
metadata=blob.metadata,
)
)
return files
except Exception as e:
@@ -205,9 +192,7 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
raise FileNotFoundError(f"Remote file not found: {remote_path}")
if not self.account_name or not self.account_key:
raise ValueError(
"Account name and key are required for SAS URL generation"
)
raise ValueError("Account name and key are required for SAS URL generation")
sas_token = generate_blob_sas(
account_name=self.account_name,
@@ -215,7 +200,7 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
blob_name=remote_path,
account_key=self.account_key,
permission=BlobSasPermissions(read=True),
expiry=datetime.utcnow() + timedelta(seconds=expires_in)
expiry=datetime.utcnow() + timedelta(seconds=expires_in),
)
return f"{blob_client.url}?{sas_token}"
@@ -239,12 +224,13 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
# Wait for copy to complete
properties = dest_blob.get_blob_properties()
while properties.copy.status == 'pending':
while properties.copy.status == "pending":
import time
time.sleep(0.1)
properties = dest_blob.get_blob_properties()
if properties.copy.status != 'success':
if properties.copy.status != "success":
raise Exception(f"Copy failed with status: {properties.copy.status}")
except FileNotFoundError:

View File

@@ -95,9 +95,7 @@ class BaseStorageAdaptor(ABC):
pass
@abstractmethod
def list_files(
self, prefix: str = "", max_results: int = 1000
) -> list[StorageObject]:
def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]:
"""
List files in cloud storage.
@@ -191,9 +189,7 @@ class BaseStorageAdaptor(ABC):
return uploaded_files
def download_directory(
self, remote_prefix: str, local_dir: str
) -> list[str]:
def download_directory(self, remote_prefix: str, local_dir: str) -> list[str]:
"""
Download directory from cloud storage.
@@ -245,9 +241,7 @@ class BaseStorageAdaptor(ABC):
raise FileNotFoundError(f"File not found: {remote_path}")
return files[0].size
def copy_file(
self, source_path: str, dest_path: str
) -> None:
def copy_file(self, source_path: str, dest_path: str) -> None:
"""
Copy file within cloud storage.

View File

@@ -9,6 +9,7 @@ from datetime import timedelta
try:
from google.cloud import storage
from google.cloud.exceptions import NotFound
GCS_AVAILABLE = True
except ImportError:
GCS_AVAILABLE = False
@@ -63,19 +64,19 @@ class GCSStorageAdaptor(BaseStorageAdaptor):
"Install with: pip install google-cloud-storage"
)
if 'bucket' not in kwargs:
if "bucket" not in kwargs:
raise ValueError("bucket parameter is required for GCS storage")
self.bucket_name = kwargs['bucket']
self.project = kwargs.get('project', os.getenv('GOOGLE_CLOUD_PROJECT'))
self.bucket_name = kwargs["bucket"]
self.project = kwargs.get("project", os.getenv("GOOGLE_CLOUD_PROJECT"))
# Initialize GCS client
client_kwargs = {}
if self.project:
client_kwargs['project'] = self.project
client_kwargs["project"] = self.project
if 'credentials_path' in kwargs:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = kwargs['credentials_path']
if "credentials_path" in kwargs:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = kwargs["credentials_path"]
self.storage_client = storage.Client(**client_kwargs)
self.bucket = self.storage_client.bucket(self.bucket_name)
@@ -122,26 +123,24 @@ class GCSStorageAdaptor(BaseStorageAdaptor):
except Exception as e:
raise Exception(f"GCS deletion failed: {e}") from e
def list_files(
self, prefix: str = "", max_results: int = 1000
) -> list[StorageObject]:
def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]:
"""List files in GCS bucket."""
try:
blobs = self.storage_client.list_blobs(
self.bucket_name,
prefix=prefix,
max_results=max_results
self.bucket_name, prefix=prefix, max_results=max_results
)
files = []
for blob in blobs:
files.append(StorageObject(
key=blob.name,
size=blob.size,
last_modified=blob.updated.isoformat() if blob.updated else None,
etag=blob.etag,
metadata=blob.metadata
))
files.append(
StorageObject(
key=blob.name,
size=blob.size,
last_modified=blob.updated.isoformat() if blob.updated else None,
etag=blob.etag,
metadata=blob.metadata,
)
)
return files
except Exception as e:
@@ -164,9 +163,7 @@ class GCSStorageAdaptor(BaseStorageAdaptor):
raise FileNotFoundError(f"Remote file not found: {remote_path}")
url = blob.generate_signed_url(
version="v4",
expiration=timedelta(seconds=expires_in),
method="GET"
version="v4", expiration=timedelta(seconds=expires_in), method="GET"
)
return url
except FileNotFoundError:
@@ -182,11 +179,7 @@ class GCSStorageAdaptor(BaseStorageAdaptor):
if not source_blob.exists():
raise FileNotFoundError(f"Source file not found: {source_path}")
self.bucket.copy_blob(
source_blob,
self.bucket,
dest_path
)
self.bucket.copy_blob(source_blob, self.bucket, dest_path)
except FileNotFoundError:
raise
except Exception as e:

View File

@@ -8,6 +8,7 @@ from pathlib import Path
try:
import boto3
from botocore.exceptions import ClientError
BOTO3_AVAILABLE = True
except ImportError:
BOTO3_AVAILABLE = False
@@ -63,33 +64,30 @@ class S3StorageAdaptor(BaseStorageAdaptor):
super().__init__(**kwargs)
if not BOTO3_AVAILABLE:
raise ImportError(
"boto3 is required for S3 storage. "
"Install with: pip install boto3"
)
raise ImportError("boto3 is required for S3 storage. Install with: pip install boto3")
if 'bucket' not in kwargs:
if "bucket" not in kwargs:
raise ValueError("bucket parameter is required for S3 storage")
self.bucket = kwargs['bucket']
self.region = kwargs.get('region', os.getenv('AWS_DEFAULT_REGION', 'us-east-1'))
self.bucket = kwargs["bucket"]
self.region = kwargs.get("region", os.getenv("AWS_DEFAULT_REGION", "us-east-1"))
# Initialize S3 client
client_kwargs = {
'region_name': self.region,
"region_name": self.region,
}
if 'endpoint_url' in kwargs:
client_kwargs['endpoint_url'] = kwargs['endpoint_url']
if "endpoint_url" in kwargs:
client_kwargs["endpoint_url"] = kwargs["endpoint_url"]
if 'aws_access_key_id' in kwargs:
client_kwargs['aws_access_key_id'] = kwargs['aws_access_key_id']
if "aws_access_key_id" in kwargs:
client_kwargs["aws_access_key_id"] = kwargs["aws_access_key_id"]
if 'aws_secret_access_key' in kwargs:
client_kwargs['aws_secret_access_key'] = kwargs['aws_secret_access_key']
if "aws_secret_access_key" in kwargs:
client_kwargs["aws_secret_access_key"] = kwargs["aws_secret_access_key"]
self.s3_client = boto3.client('s3', **client_kwargs)
self.s3_resource = boto3.resource('s3', **client_kwargs)
self.s3_client = boto3.client("s3", **client_kwargs)
self.s3_resource = boto3.resource("s3", **client_kwargs)
def upload_file(
self, local_path: str, remote_path: str, metadata: dict[str, str] | None = None
@@ -101,14 +99,14 @@ class S3StorageAdaptor(BaseStorageAdaptor):
extra_args = {}
if metadata:
extra_args['Metadata'] = metadata
extra_args["Metadata"] = metadata
try:
self.s3_client.upload_file(
str(local_file),
self.bucket,
remote_path,
ExtraArgs=extra_args if extra_args else None
ExtraArgs=extra_args if extra_args else None,
)
return f"s3://{self.bucket}/{remote_path}"
except ClientError as e:
@@ -120,50 +118,41 @@ class S3StorageAdaptor(BaseStorageAdaptor):
local_file.parent.mkdir(parents=True, exist_ok=True)
try:
self.s3_client.download_file(
self.bucket,
remote_path,
str(local_file)
)
self.s3_client.download_file(self.bucket, remote_path, str(local_file))
except ClientError as e:
if e.response['Error']['Code'] == '404':
if e.response["Error"]["Code"] == "404":
raise FileNotFoundError(f"Remote file not found: {remote_path}") from e
raise Exception(f"S3 download failed: {e}") from e
def delete_file(self, remote_path: str) -> None:
"""Delete file from S3."""
try:
self.s3_client.delete_object(
Bucket=self.bucket,
Key=remote_path
)
self.s3_client.delete_object(Bucket=self.bucket, Key=remote_path)
except ClientError as e:
raise Exception(f"S3 deletion failed: {e}") from e
def list_files(
self, prefix: str = "", max_results: int = 1000
) -> list[StorageObject]:
def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]:
"""List files in S3 bucket."""
try:
paginator = self.s3_client.get_paginator('list_objects_v2')
paginator = self.s3_client.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(
Bucket=self.bucket,
Prefix=prefix,
PaginationConfig={'MaxItems': max_results}
Bucket=self.bucket, Prefix=prefix, PaginationConfig={"MaxItems": max_results}
)
files = []
for page in page_iterator:
if 'Contents' not in page:
if "Contents" not in page:
continue
for obj in page['Contents']:
files.append(StorageObject(
key=obj['Key'],
size=obj['Size'],
last_modified=obj['LastModified'].isoformat(),
etag=obj.get('ETag', '').strip('"')
))
for obj in page["Contents"]:
files.append(
StorageObject(
key=obj["Key"],
size=obj["Size"],
last_modified=obj["LastModified"].isoformat(),
etag=obj.get("ETag", "").strip('"'),
)
)
return files
except ClientError as e:
@@ -172,13 +161,10 @@ class S3StorageAdaptor(BaseStorageAdaptor):
def file_exists(self, remote_path: str) -> bool:
"""Check if file exists in S3."""
try:
self.s3_client.head_object(
Bucket=self.bucket,
Key=remote_path
)
self.s3_client.head_object(Bucket=self.bucket, Key=remote_path)
return True
except ClientError as e:
if e.response['Error']['Code'] == '404':
if e.response["Error"]["Code"] == "404":
return False
raise Exception(f"S3 head_object failed: {e}") from e
@@ -186,12 +172,9 @@ class S3StorageAdaptor(BaseStorageAdaptor):
"""Generate presigned URL for S3 object."""
try:
url = self.s3_client.generate_presigned_url(
'get_object',
Params={
'Bucket': self.bucket,
'Key': remote_path
},
ExpiresIn=expires_in
"get_object",
Params={"Bucket": self.bucket, "Key": remote_path},
ExpiresIn=expires_in,
)
return url
except ClientError as e:
@@ -200,16 +183,9 @@ class S3StorageAdaptor(BaseStorageAdaptor):
def copy_file(self, source_path: str, dest_path: str) -> None:
"""Copy file within S3 bucket (server-side copy)."""
try:
copy_source = {
'Bucket': self.bucket,
'Key': source_path
}
self.s3_client.copy_object(
CopySource=copy_source,
Bucket=self.bucket,
Key=dest_path
)
copy_source = {"Bucket": self.bucket, "Key": source_path}
self.s3_client.copy_object(CopySource=copy_source, Bucket=self.bucket, Key=dest_path)
except ClientError as e:
if e.response['Error']['Code'] == '404':
if e.response["Error"]["Code"] == "404":
raise FileNotFoundError(f"Source file not found: {source_path}") from e
raise Exception(f"S3 copy failed: {e}") from e

View File

@@ -17,6 +17,7 @@ import time
@dataclass
class ChunkMetadata:
"""Metadata for a document chunk."""
chunk_id: str
source: str
category: str
@@ -30,6 +31,7 @@ class ChunkMetadata:
@dataclass
class IngestionProgress:
"""Progress tracking for streaming ingestion."""
total_documents: int
processed_documents: int
total_chunks: int
@@ -81,7 +83,7 @@ class StreamingIngester:
chunk_size: int = 4000,
chunk_overlap: int = 200,
batch_size: int = 100,
max_memory_mb: int = 500
max_memory_mb: int = 500,
):
"""
Initialize streaming ingester.
@@ -103,7 +105,7 @@ class StreamingIngester:
content: str,
metadata: dict,
chunk_size: int | None = None,
chunk_overlap: int | None = None
chunk_overlap: int | None = None,
) -> Iterator[tuple[str, ChunkMetadata]]:
"""
Split document into overlapping chunks.
@@ -130,7 +132,7 @@ class StreamingIngester:
chunk_index=0,
total_chunks=1,
char_start=0,
char_end=len(content)
char_end=len(content),
)
yield content, chunk_meta
return
@@ -162,7 +164,7 @@ class StreamingIngester:
chunk_index=i,
total_chunks=total_chunks,
char_start=start,
char_end=end
char_end=end,
)
yield chunk_text, chunk_meta
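The char_start/char_end bookkeeping above follows from a stride of chunk_size minus chunk_overlap; a sketch of that arithmetic (assuming character-based windows, as the fields suggest):

def windows(length, size=4000, overlap=200):
    # Consecutive windows share `overlap` characters.
    return [(s, min(s + size, length)) for s in range(0, length, size - overlap)]

print(windows(9000))  # [(0, 4000), (3800, 7800), (7600, 9000)]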
@@ -170,17 +172,12 @@ class StreamingIngester:
def _generate_chunk_id(self, content: str, metadata: dict, chunk_index: int) -> str:
"""Generate deterministic chunk ID."""
id_string = (
f"{metadata.get('source', '')}-"
f"{metadata.get('file', '')}-"
f"{chunk_index}-"
f"{content[:50]}"
f"{metadata.get('source', '')}-{metadata.get('file', '')}-{chunk_index}-{content[:50]}"
)
return hashlib.md5(id_string.encode()).hexdigest()
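Because the ID above hashes stable inputs, re-ingesting unchanged content yields the same keys, which makes vector-store upserts idempotent. In isolation:

import hashlib

def chunk_id(source, file, index, content):
    raw = f"{source}-{file}-{index}-{content[:50]}"
    return hashlib.md5(raw.encode()).hexdigest()

assert chunk_id("react", "SKILL.md", 0, "React is a library") == \
       chunk_id("react", "SKILL.md", 0, "React is a library")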
def stream_skill_directory(
self,
skill_dir: Path,
callback: callable | None = None
self, skill_dir: Path, callback: callable | None = None
) -> Iterator[tuple[str, dict]]:
"""
Stream all documents from skill directory.
@@ -218,7 +215,7 @@ class StreamingIngester:
processed_chunks=0,
failed_chunks=0,
bytes_processed=0,
start_time=time.time()
start_time=time.time(),
)
# Process each document
@@ -235,11 +232,13 @@ class StreamingIngester:
"category": category,
"file": filename,
"type": "documentation" if filename == "SKILL.md" else "reference",
"version": "1.0.0"
"version": "1.0.0",
}
# Chunk document and yield chunks
for chunk_count, (chunk_text, chunk_meta) in enumerate(self.chunk_document(content, metadata), start=1):
for chunk_count, (chunk_text, chunk_meta) in enumerate(
self.chunk_document(content, metadata), start=1
):
self.progress.total_chunks += 1
# Convert chunk metadata to dict
@@ -272,9 +271,7 @@ class StreamingIngester:
continue
def batch_iterator(
self,
chunks: Iterator[tuple[str, dict]],
batch_size: int | None = None
self, chunks: Iterator[tuple[str, dict]], batch_size: int | None = None
) -> Iterator[list[tuple[str, dict]]]:
"""
Group chunks into batches for efficient processing.
@@ -321,7 +318,7 @@ class StreamingIngester:
"failed_chunks": self.progress.failed_chunks,
"bytes_processed": self.progress.bytes_processed,
},
"state": state
"state": state,
}
checkpoint_path.write_text(json.dumps(checkpoint_data, indent=2))
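The natural counterpart to the save above is a loader that tolerates a missing file; a minimal sketch (loader name hypothetical):

import json
from pathlib import Path

def load_checkpoint(path: Path) -> dict | None:
    # Returns None on first run so callers can start from scratch.
    return json.loads(path.read_text()) if path.exists() else None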
@@ -384,23 +381,25 @@ def main():
parser = argparse.ArgumentParser(description="Stream and chunk skill documents")
parser.add_argument("input", help="Input file or directory path")
parser.add_argument("--chunk-size", type=int, default=4000, help="Chunk size in characters")
parser.add_argument("--chunk-overlap", type=int, default=200, help="Chunk overlap in characters")
parser.add_argument(
"--chunk-overlap", type=int, default=200, help="Chunk overlap in characters"
)
parser.add_argument("--batch-size", type=int, default=100, help="Batch size for processing")
parser.add_argument("--checkpoint", help="Checkpoint file path")
args = parser.parse_args()
# Initialize ingester
ingester = StreamingIngester(
chunk_size=args.chunk_size,
chunk_overlap=args.chunk_overlap,
batch_size=args.batch_size
chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap, batch_size=args.batch_size
)
# Progress callback
def on_progress(progress: IngestionProgress):
if progress.processed_chunks % 10 == 0:
print(f"Progress: {progress.progress_percent:.1f}% - "
f"{progress.processed_chunks}/{progress.total_chunks} chunks")
print(
f"Progress: {progress.progress_percent:.1f}% - "
f"{progress.processed_chunks}/{progress.total_chunks} chunks"
)
# Stream input
input_path = Path(args.input)
@@ -416,17 +415,23 @@ def main():
metadata = {"source": input_path.stem, "file": input_path.name}
file_chunks = ingester.chunk_document(content, metadata)
# Convert to generator format matching stream_skill_directory
chunks = ((text, {
"content": text,
"chunk_id": meta.chunk_id,
"source": meta.source,
"category": meta.category,
"file": meta.file,
"chunk_index": meta.chunk_index,
"total_chunks": meta.total_chunks,
"char_start": meta.char_start,
"char_end": meta.char_end,
}) for text, meta in file_chunks)
chunks = (
(
text,
{
"content": text,
"chunk_id": meta.chunk_id,
"source": meta.source,
"category": meta.category,
"file": meta.file,
"chunk_index": meta.chunk_index,
"total_chunks": meta.total_chunks,
"char_start": meta.char_start,
"char_end": meta.char_end,
},
)
for text, meta in file_chunks
)
# Process in batches
all_chunks = []
@@ -437,8 +442,7 @@ def main():
# Save checkpoint if specified
if args.checkpoint:
ingester.save_checkpoint(
Path(args.checkpoint),
{"processed_batches": len(all_chunks) // args.batch_size}
Path(args.checkpoint), {"processed_batches": len(all_chunks) // args.batch_size}
)
# Final progress
@@ -449,4 +453,5 @@ def main():
if __name__ == "__main__":
import sys
sys.exit(main())

View File

@@ -22,9 +22,7 @@ def handle_signal(_signum, _frame):
def start_command(args):
"""Start monitoring."""
monitor = SyncMonitor(
config_path=args.config,
check_interval=args.interval,
auto_update=args.auto_update
config_path=args.config, check_interval=args.interval, auto_update=args.auto_update
)
# Register signal handlers
@@ -42,6 +40,7 @@ def start_command(args):
# Keep running
while True:
import time
time.sleep(1)
except KeyboardInterrupt:
@@ -53,7 +52,7 @@ def check_command(args):
"""Check for changes once."""
monitor = SyncMonitor(
config_path=args.config,
check_interval=3600 # Not used for single check
check_interval=3600, # Not used for single check
)
print(f"🔍 Checking {args.config} for changes...")
@@ -82,7 +81,7 @@ def check_command(args):
print(f"{change.url}")
if change.diff and args.diff:
print(f" Diff preview (first 5 lines):")
for line in change.diff.split('\n')[:5]:
for line in change.diff.split("\n")[:5]:
print(f" {line}")
if report.deleted:
@@ -95,10 +94,7 @@ def check_command(args):
def stats_command(args):
"""Show monitoring statistics."""
monitor = SyncMonitor(
config_path=args.config,
check_interval=3600
)
monitor = SyncMonitor(config_path=args.config, check_interval=3600)
stats = monitor.stats()
@@ -117,7 +113,7 @@ def reset_command(args):
state_file = Path(f"{args.skill_name}_sync.json")
if state_file.exists():
if args.force or input(f"⚠️ Reset state for {args.skill_name}? [y/N]: ").lower() == 'y':
if args.force or input(f"⚠️ Reset state for {args.skill_name}? [y/N]: ").lower() == "y":
state_file.unlink()
print(f"✅ State reset for {args.skill_name}")
else:
@@ -129,7 +125,7 @@ def reset_command(args):
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description='Monitor documentation for changes and update skills',
description="Monitor documentation for changes and update skills",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -153,52 +149,39 @@ Examples:
# Reset state
skill-seekers-sync reset --skill-name react
"""
""",
)
subparsers = parser.add_subparsers(dest='command', help='Command to execute')
subparsers = parser.add_subparsers(dest="command", help="Command to execute")
# Start command
start_parser = subparsers.add_parser('start', help='Start continuous monitoring')
start_parser.add_argument('--config', required=True, help='Path to skill config file')
start_parser = subparsers.add_parser("start", help="Start continuous monitoring")
start_parser.add_argument("--config", required=True, help="Path to skill config file")
start_parser.add_argument(
'--interval', '-i',
"--interval",
"-i",
type=int,
default=3600,
help='Check interval in seconds (default: 3600 = 1 hour)'
help="Check interval in seconds (default: 3600 = 1 hour)",
)
start_parser.add_argument(
'--auto-update',
action='store_true',
help='Automatically rebuild skill on changes'
"--auto-update", action="store_true", help="Automatically rebuild skill on changes"
)
# Check command
check_parser = subparsers.add_parser('check', help='Check for changes once')
check_parser.add_argument('--config', required=True, help='Path to skill config file')
check_parser.add_argument(
'--diff', '-d',
action='store_true',
help='Generate content diffs'
)
check_parser.add_argument(
'--verbose', '-v',
action='store_true',
help='Show detailed output'
)
check_parser = subparsers.add_parser("check", help="Check for changes once")
check_parser.add_argument("--config", required=True, help="Path to skill config file")
check_parser.add_argument("--diff", "-d", action="store_true", help="Generate content diffs")
check_parser.add_argument("--verbose", "-v", action="store_true", help="Show detailed output")
# Stats command
stats_parser = subparsers.add_parser('stats', help='Show monitoring statistics')
stats_parser.add_argument('--config', required=True, help='Path to skill config file')
stats_parser = subparsers.add_parser("stats", help="Show monitoring statistics")
stats_parser.add_argument("--config", required=True, help="Path to skill config file")
# Reset command
reset_parser = subparsers.add_parser('reset', help='Reset monitoring state')
reset_parser.add_argument('--skill-name', required=True, help='Skill name')
reset_parser.add_argument(
'--force', '-f',
action='store_true',
help='Skip confirmation'
)
reset_parser = subparsers.add_parser("reset", help="Reset monitoring state")
reset_parser.add_argument("--skill-name", required=True, help="Skill name")
reset_parser.add_argument("--force", "-f", action="store_true", help="Skip confirmation")
args = parser.parse_args()
@@ -207,18 +190,18 @@ Examples:
sys.exit(1)
try:
if args.command == 'start':
if args.command == "start":
start_command(args)
elif args.command == 'check':
elif args.command == "check":
check_command(args)
elif args.command == 'stats':
elif args.command == "stats":
stats_command(args)
elif args.command == 'reset':
elif args.command == "reset":
reset_command(args)
except Exception as e:
print(f"\n❌ Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':
if __name__ == "__main__":
main()
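The command routing above is the stock argparse subcommand pattern; stripped to its core (config path hypothetical):

import argparse

parser = argparse.ArgumentParser()
sub = parser.add_subparsers(dest="command")
check = sub.add_parser("check")
check.add_argument("--config", required=True)

args = parser.parse_args(["check", "--config", "configs/react.json"])
assert args.command == "check" and args.config == "configs/react.json"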

View File

@@ -59,7 +59,7 @@ def upload_skill_api(package_path, target="claude", api_key=None, **kwargs):
api_key = os.environ.get(adaptor.get_env_var_name(), "").strip()
# API key validation only for platforms that require it
if target in ['claude', 'gemini', 'openai']:
if target in ["claude", "gemini", "openai"]:
if not api_key:
return False, f"{adaptor.get_env_var_name()} not set. Export your API key first."
@@ -172,41 +172,39 @@ Examples:
# ChromaDB upload options
parser.add_argument(
"--chroma-url",
help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)"
help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)",
)
parser.add_argument(
"--persist-directory",
help="Local directory for persistent ChromaDB storage (default: ./chroma_db)"
help="Local directory for persistent ChromaDB storage (default: ./chroma_db)",
)
parser.add_argument(
"--embedding-function",
choices=["openai", "sentence-transformers", "none"],
help="Embedding function for ChromaDB/Weaviate (default: platform default)"
help="Embedding function for ChromaDB/Weaviate (default: platform default)",
)
parser.add_argument(
"--openai-api-key",
help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)"
"--openai-api-key", help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)"
)
# Weaviate upload options
parser.add_argument(
"--weaviate-url",
default="http://localhost:8080",
help="Weaviate URL (default: http://localhost:8080)"
help="Weaviate URL (default: http://localhost:8080)",
)
parser.add_argument(
"--use-cloud",
action="store_true",
help="Use Weaviate Cloud (requires --api-key and --cluster-url)"
help="Use Weaviate Cloud (requires --api-key and --cluster-url)",
)
parser.add_argument(
"--cluster-url",
help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)"
"--cluster-url", help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)"
)
args = parser.parse_args()
@@ -214,28 +212,30 @@ Examples:
# Build kwargs for vector DB upload
upload_kwargs = {}
if args.target == 'chroma':
if args.target == "chroma":
if args.chroma_url:
upload_kwargs['chroma_url'] = args.chroma_url
upload_kwargs["chroma_url"] = args.chroma_url
if args.persist_directory:
upload_kwargs['persist_directory'] = args.persist_directory
upload_kwargs["persist_directory"] = args.persist_directory
if args.embedding_function:
upload_kwargs['embedding_function'] = args.embedding_function
upload_kwargs["embedding_function"] = args.embedding_function
if args.openai_api_key:
upload_kwargs['openai_api_key'] = args.openai_api_key
upload_kwargs["openai_api_key"] = args.openai_api_key
elif args.target == 'weaviate':
upload_kwargs['weaviate_url'] = args.weaviate_url
upload_kwargs['use_cloud'] = args.use_cloud
elif args.target == "weaviate":
upload_kwargs["weaviate_url"] = args.weaviate_url
upload_kwargs["use_cloud"] = args.use_cloud
if args.cluster_url:
upload_kwargs['cluster_url'] = args.cluster_url
upload_kwargs["cluster_url"] = args.cluster_url
if args.embedding_function:
upload_kwargs['embedding_function'] = args.embedding_function
upload_kwargs["embedding_function"] = args.embedding_function
if args.openai_api_key:
upload_kwargs['openai_api_key'] = args.openai_api_key
upload_kwargs["openai_api_key"] = args.openai_api_key
# Upload skill
success, message = upload_skill_api(args.package_file, args.target, args.api_key, **upload_kwargs)
success, message = upload_skill_api(
args.package_file, args.target, args.api_key, **upload_kwargs
)
if success:
sys.exit(0)
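The kwargs assembled above flow straight through **upload_kwargs into the API call; invoked directly, that looks like (paths and options hypothetical):

success, message = upload_skill_api(
    "output/react.zip",
    target="chroma",
    persist_directory="./chroma_db",
    embedding_function="sentence-transformers",
)
print(message)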

View File

@@ -23,9 +23,9 @@ from .generator import EmbeddingGenerator
from .cache import EmbeddingCache
__all__ = [
'EmbeddingRequest',
'EmbeddingResponse',
'BatchEmbeddingRequest',
'EmbeddingGenerator',
'EmbeddingCache',
"EmbeddingRequest",
"EmbeddingResponse",
"BatchEmbeddingRequest",
"EmbeddingGenerator",
"EmbeddingCache",
]

View File

@@ -74,12 +74,7 @@ class EmbeddingCache:
self.conn.commit()
def set(
self,
hash_key: str,
embedding: list[float],
model: str
) -> None:
def set(self, hash_key: str, embedding: list[float], model: str) -> None:
"""
Store embedding in cache.
@@ -94,11 +89,14 @@ class EmbeddingCache:
embedding_json = json.dumps(embedding)
dimensions = len(embedding)
cursor.execute("""
cursor.execute(
"""
INSERT OR REPLACE INTO embeddings
(hash, embedding, model, dimensions, created_at, accessed_at, access_count)
VALUES (?, ?, ?, ?, ?, ?, 1)
""", (hash_key, embedding_json, model, dimensions, now, now))
""",
(hash_key, embedding_json, model, dimensions, now, now),
)
self.conn.commit()
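The cache is a single-table SQLite upsert keyed by content hash, with vectors stored as JSON text. A standalone sketch of the same pattern (schema inferred from the column list above):

import json
import sqlite3
from datetime import datetime

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE embeddings (hash TEXT PRIMARY KEY, embedding TEXT, model TEXT,"
    " dimensions INTEGER, created_at TEXT, accessed_at TEXT, access_count INTEGER)"
)
now = datetime.utcnow().isoformat()
conn.execute(
    "INSERT OR REPLACE INTO embeddings VALUES (?, ?, ?, ?, ?, ?, 1)",
    ("abc123", json.dumps([0.1, 0.2, 0.3]), "text-embedding-3-small", 3, now, now),
)
conn.commit()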
@@ -115,11 +113,14 @@ class EmbeddingCache:
cursor = self.conn.cursor()
# Get embedding
cursor.execute("""
cursor.execute(
"""
SELECT embedding, created_at
FROM embeddings
WHERE hash = ?
""", (hash_key,))
""",
(hash_key,),
)
row = cursor.fetchone()
if not row:
@@ -136,11 +137,14 @@ class EmbeddingCache:
# Update access stats
now = datetime.utcnow().isoformat()
cursor.execute("""
cursor.execute(
"""
UPDATE embeddings
SET accessed_at = ?, access_count = access_count + 1
WHERE hash = ?
""", (now, hash_key))
""",
(now, hash_key),
)
self.conn.commit()
return json.loads(embedding_json)
@@ -178,11 +182,14 @@ class EmbeddingCache:
"""
cursor = self.conn.cursor()
cursor.execute("""
cursor.execute(
"""
SELECT created_at
FROM embeddings
WHERE hash = ?
""", (hash_key,))
""",
(hash_key,),
)
row = cursor.fetchone()
if not row:
@@ -206,10 +213,13 @@ class EmbeddingCache:
"""
cursor = self.conn.cursor()
cursor.execute("""
cursor.execute(
"""
DELETE FROM embeddings
WHERE hash = ?
""", (hash_key,))
""",
(hash_key,),
)
self.conn.commit()
@@ -226,10 +236,13 @@ class EmbeddingCache:
cursor = self.conn.cursor()
if model:
cursor.execute("""
cursor.execute(
"""
DELETE FROM embeddings
WHERE model = ?
""", (model,))
""",
(model,),
)
else:
cursor.execute("DELETE FROM embeddings")
@@ -249,10 +262,13 @@ class EmbeddingCache:
cutoff = (datetime.utcnow() - timedelta(days=self.ttl_days)).isoformat()
cursor.execute("""
cursor.execute(
"""
DELETE FROM embeddings
WHERE created_at < ?
""", (cutoff,))
""",
(cutoff,),
)
deleted = cursor.rowcount
self.conn.commit()
@@ -300,17 +316,19 @@ class EmbeddingCache:
LIMIT 10
""")
top_accessed = [
{"hash": row[0], "model": row[1], "access_count": row[2]}
for row in cursor.fetchall()
{"hash": row[0], "model": row[1], "access_count": row[2]} for row in cursor.fetchall()
]
# Expired entries
cutoff = (datetime.utcnow() - timedelta(days=self.ttl_days)).isoformat()
cursor.execute("""
cursor.execute(
"""
SELECT COUNT(*)
FROM embeddings
WHERE created_at < ?
""", (cutoff,))
""",
(cutoff,),
)
expired = cursor.fetchone()[0]
return {
@@ -318,7 +336,7 @@ class EmbeddingCache:
"by_model": by_model,
"top_accessed": top_accessed,
"expired": expired,
"ttl_days": self.ttl_days
"ttl_days": self.ttl_days,
}
def close(self):

View File

@@ -9,6 +9,7 @@ import numpy as np
# OpenAI support
try:
from openai import OpenAI
OPENAI_AVAILABLE = True
except ImportError:
OPENAI_AVAILABLE = False
@@ -16,6 +17,7 @@ except ImportError:
# Sentence transformers support
try:
from sentence_transformers import SentenceTransformer
SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
SENTENCE_TRANSFORMERS_AVAILABLE = False
@@ -23,6 +25,7 @@ except ImportError:
# Voyage AI support (recommended by Anthropic for embeddings)
try:
import voyageai
VOYAGE_AVAILABLE = True
except ImportError:
VOYAGE_AVAILABLE = False
@@ -129,7 +132,7 @@ class EmbeddingGenerator:
self,
api_key: str | None = None,
voyage_api_key: str | None = None,
cache_dir: str | None = None
cache_dir: str | None = None,
):
"""
Initialize embedding generator.
@@ -162,8 +165,7 @@ class EmbeddingGenerator:
"""Get information about a model."""
if model not in self.MODELS:
raise ValueError(
f"Unknown model: {model}. "
f"Available models: {', '.join(self.MODELS.keys())}"
f"Unknown model: {model}. Available models: {', '.join(self.MODELS.keys())}"
)
return self.MODELS[model]
@@ -171,20 +173,19 @@ class EmbeddingGenerator:
"""List all available models."""
models = []
for name, info in self.MODELS.items():
models.append({
"name": name,
"provider": info["provider"],
"dimensions": info["dimensions"],
"max_tokens": info["max_tokens"],
"cost_per_million": info.get("cost_per_million", 0.0),
})
models.append(
{
"name": name,
"provider": info["provider"],
"dimensions": info["dimensions"],
"max_tokens": info["max_tokens"],
"cost_per_million": info.get("cost_per_million", 0.0),
}
)
return models
def generate(
self,
text: str,
model: str = "text-embedding-3-small",
normalize: bool = True
self, text: str, model: str = "text-embedding-3-small", normalize: bool = True
) -> list[float]:
"""
Generate embedding for a single text.
@@ -218,7 +219,7 @@ class EmbeddingGenerator:
texts: list[str],
model: str = "text-embedding-3-small",
normalize: bool = True,
batch_size: int = 32
batch_size: int = 32,
) -> tuple[list[list[float]], int]:
"""
Generate embeddings for multiple texts.
@@ -248,24 +249,18 @@ class EmbeddingGenerator:
else:
raise ValueError(f"Unsupported provider: {provider}")
def _generate_openai(
self, text: str, model: str, normalize: bool
) -> list[float]:
def _generate_openai(self, text: str, model: str, normalize: bool) -> list[float]:
"""Generate embedding using OpenAI API."""
if not OPENAI_AVAILABLE:
raise ImportError(
"OpenAI is required for OpenAI embeddings. "
"Install with: pip install openai"
"OpenAI is required for OpenAI embeddings. Install with: pip install openai"
)
if not self.openai_client:
raise ValueError("OpenAI API key not provided")
try:
response = self.openai_client.embeddings.create(
input=text,
model=model
)
response = self.openai_client.embeddings.create(input=text, model=model)
embedding = response.data[0].embedding
if normalize:
@@ -281,8 +276,7 @@ class EmbeddingGenerator:
"""Generate embeddings using OpenAI API in batches."""
if not OPENAI_AVAILABLE:
raise ImportError(
"OpenAI is required for OpenAI embeddings. "
"Install with: pip install openai"
"OpenAI is required for OpenAI embeddings. Install with: pip install openai"
)
if not self.openai_client:
@@ -292,13 +286,10 @@ class EmbeddingGenerator:
# Process in batches
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
batch = texts[i : i + batch_size]
try:
response = self.openai_client.embeddings.create(
input=batch,
model=model
)
response = self.openai_client.embeddings.create(input=batch, model=model)
batch_embeddings = [item.embedding for item in response.data]
@@ -313,24 +304,18 @@ class EmbeddingGenerator:
dimensions = len(all_embeddings[0]) if all_embeddings else 0
return all_embeddings, dimensions
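The normalize flag above presumably scales each vector to unit length so cosine similarity reduces to a plain dot product; a sketch of that step with numpy (the module's actual helper is not shown in this diff):

import numpy as np

def l2_normalize(vec):
    arr = np.asarray(vec, dtype=np.float32)
    norm = np.linalg.norm(arr)
    # Guard against the zero vector, which has no direction to preserve.
    return (arr / norm).tolist() if norm > 0 else arr.tolist()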
def _generate_voyage(
self, text: str, model: str, normalize: bool
) -> list[float]:
def _generate_voyage(self, text: str, model: str, normalize: bool) -> list[float]:
"""Generate embedding using Voyage AI API."""
if not VOYAGE_AVAILABLE:
raise ImportError(
"voyageai is required for Voyage AI embeddings. "
"Install with: pip install voyageai"
"voyageai is required for Voyage AI embeddings. Install with: pip install voyageai"
)
if not self.voyage_client:
raise ValueError("Voyage API key not provided")
try:
result = self.voyage_client.embed(
texts=[text],
model=model
)
result = self.voyage_client.embed(texts=[text], model=model)
embedding = result.embeddings[0]
if normalize:
@@ -346,8 +331,7 @@ class EmbeddingGenerator:
"""Generate embeddings using Voyage AI API in batches."""
if not VOYAGE_AVAILABLE:
raise ImportError(
"voyageai is required for Voyage AI embeddings. "
"Install with: pip install voyageai"
"voyageai is required for Voyage AI embeddings. Install with: pip install voyageai"
)
if not self.voyage_client:
@@ -357,13 +341,10 @@ class EmbeddingGenerator:
# Process in batches (Voyage AI supports up to 128 texts per request)
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
batch = texts[i : i + batch_size]
try:
result = self.voyage_client.embed(
texts=batch,
model=model
)
result = self.voyage_client.embed(texts=batch, model=model)
batch_embeddings = result.embeddings
@@ -378,9 +359,7 @@ class EmbeddingGenerator:
dimensions = len(all_embeddings[0]) if all_embeddings else 0
return all_embeddings, dimensions
def _generate_sentence_transformer(
self, text: str, model: str, normalize: bool
) -> list[float]:
def _generate_sentence_transformer(self, text: str, model: str, normalize: bool) -> list[float]:
"""Generate embedding using sentence-transformers."""
if not SENTENCE_TRANSFORMERS_AVAILABLE:
raise ImportError(
@@ -417,10 +396,7 @@ class EmbeddingGenerator:
# Generate embeddings in batches
embeddings = st_model.encode(
texts,
batch_size=batch_size,
normalize_embeddings=normalize,
show_progress_bar=False
texts, batch_size=batch_size, normalize_embeddings=normalize, show_progress_bar=False
)
dimensions = len(embeddings[0]) if len(embeddings) > 0 else 0

View File

@@ -14,20 +14,14 @@ class EmbeddingRequest(BaseModel):
"example": {
"text": "This is a test document about Python programming.",
"model": "text-embedding-3-small",
"normalize": True
"normalize": True,
}
}
)
text: str = Field(..., description="Text to generate embedding for")
model: str = Field(
default="text-embedding-3-small",
description="Embedding model to use"
)
normalize: bool = Field(
default=True,
description="Normalize embeddings to unit length"
)
model: str = Field(default="text-embedding-3-small", description="Embedding model to use")
normalize: bool = Field(default=True, description="Normalize embeddings to unit length")
class BatchEmbeddingRequest(BaseModel):
@@ -39,27 +33,20 @@ class BatchEmbeddingRequest(BaseModel):
"texts": [
"First document about Python",
"Second document about JavaScript",
"Third document about Rust"
"Third document about Rust",
],
"model": "text-embedding-3-small",
"normalize": True,
"batch_size": 32
"batch_size": 32,
}
}
)
texts: list[str] = Field(..., description="List of texts to embed")
model: str = Field(
default="text-embedding-3-small",
description="Embedding model to use"
)
normalize: bool = Field(
default=True,
description="Normalize embeddings to unit length"
)
model: str = Field(default="text-embedding-3-small", description="Embedding model to use")
normalize: bool = Field(default=True, description="Normalize embeddings to unit length")
batch_size: int | None = Field(
default=32,
description="Batch size for processing (default: 32)"
default=32, description="Batch size for processing (default: 32)"
)
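Construction-wise, these Field defaults mean a caller only needs to supply texts; everything else back-fills:

req = BatchEmbeddingRequest(texts=["First document about Python"])
assert req.model == "text-embedding-3-small"
assert req.normalize is True and req.batch_size == 32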
@@ -69,10 +56,7 @@ class EmbeddingResponse(BaseModel):
embedding: list[float] = Field(..., description="Generated embedding vector")
model: str = Field(..., description="Model used for generation")
dimensions: int = Field(..., description="Embedding dimensions")
cached: bool = Field(
default=False,
description="Whether embedding was retrieved from cache"
)
cached: bool = Field(default=False, description="Whether embedding was retrieved from cache")
class BatchEmbeddingResponse(BaseModel):
@@ -82,10 +66,7 @@ class BatchEmbeddingResponse(BaseModel):
model: str = Field(..., description="Model used for generation")
dimensions: int = Field(..., description="Embedding dimensions")
count: int = Field(..., description="Number of embeddings generated")
cached_count: int = Field(
default=0,
description="Number of embeddings retrieved from cache"
)
cached_count: int = Field(default=0, description="Number of embeddings retrieved from cache")
class SkillEmbeddingRequest(BaseModel):
@@ -97,24 +78,15 @@ class SkillEmbeddingRequest(BaseModel):
"skill_path": "/path/to/skill/react",
"model": "text-embedding-3-small",
"chunk_size": 512,
"overlap": 50
"overlap": 50,
}
}
)
skill_path: str = Field(..., description="Path to skill directory")
model: str = Field(
default="text-embedding-3-small",
description="Embedding model to use"
)
chunk_size: int = Field(
default=512,
description="Chunk size for splitting documents (tokens)"
)
overlap: int = Field(
default=50,
description="Overlap between chunks (tokens)"
)
model: str = Field(default="text-embedding-3-small", description="Embedding model to use")
chunk_size: int = Field(default=512, description="Chunk size for splitting documents (tokens)")
overlap: int = Field(default=50, description="Overlap between chunks (tokens)")
class SkillEmbeddingResponse(BaseModel):
@@ -124,10 +96,7 @@ class SkillEmbeddingResponse(BaseModel):
total_chunks: int = Field(..., description="Total number of chunks embedded")
model: str = Field(..., description="Model used for generation")
dimensions: int = Field(..., description="Embedding dimensions")
metadata: dict[str, Any] = Field(
default_factory=dict,
description="Skill metadata"
)
metadata: dict[str, Any] = Field(default_factory=dict, description="Skill metadata")
class HealthResponse(BaseModel):
@@ -144,12 +113,13 @@ class ModelInfo(BaseModel):
"""Information about an embedding model."""
name: str = Field(..., description="Model name")
provider: str = Field(..., description="Model provider (openai, anthropic, sentence-transformers)")
provider: str = Field(
..., description="Model provider (openai, anthropic, sentence-transformers)"
)
dimensions: int = Field(..., description="Embedding dimensions")
max_tokens: int = Field(..., description="Maximum input tokens")
cost_per_million: float | None = Field(
None,
description="Cost per million tokens (if applicable)"
None, description="Cost per million tokens (if applicable)"
)
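The Field(...) collapses in this file are formatting-only; for reference, the declaration style itself in a minimal standalone Pydantic v2 sketch (model and field names illustrative):

from pydantic import BaseModel, ConfigDict, Field

class ExampleRequest(BaseModel):
    model_config = ConfigDict(
        json_schema_extra={"example": {"text": "hello", "normalize": True}}
    )

    text: str = Field(..., description="Text to process")
    model: str = Field(default="text-embedding-3-small", description="Embedding model to use")
    normalize: bool = Field(default=True, description="Normalize embeddings to unit length")

req = ExampleRequest(text="hello")
print(req.model_dump())  # {'text': 'hello', 'model': 'text-embedding-3-small', 'normalize': True}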

View File

@@ -25,6 +25,7 @@ try:
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
FASTAPI_AVAILABLE = True
except ImportError:
FASTAPI_AVAILABLE = False
@@ -51,7 +52,7 @@ if FASTAPI_AVAILABLE:
description="Generate embeddings for text and skill content",
version="1.0.0",
docs_url="/docs",
redoc_url="/redoc"
redoc_url="/redoc",
)
# Add CORS middleware
@@ -64,13 +65,14 @@ if FASTAPI_AVAILABLE:
)
# Initialize generator and cache
cache_dir = os.getenv("EMBEDDING_CACHE_DIR", os.path.expanduser("~/.cache/skill-seekers/embeddings"))
cache_dir = os.getenv(
"EMBEDDING_CACHE_DIR", os.path.expanduser("~/.cache/skill-seekers/embeddings")
)
cache_db = os.path.join(cache_dir, "embeddings.db")
cache_enabled = os.getenv("EMBEDDING_CACHE_ENABLED", "true").lower() == "true"
generator = EmbeddingGenerator(
api_key=os.getenv("OPENAI_API_KEY"),
voyage_api_key=os.getenv("VOYAGE_API_KEY")
api_key=os.getenv("OPENAI_API_KEY"), voyage_api_key=os.getenv("VOYAGE_API_KEY")
)
cache = EmbeddingCache(cache_db) if cache_enabled else None
@@ -81,7 +83,7 @@ if FASTAPI_AVAILABLE:
"service": "Skill Seekers Embedding API",
"version": "1.0.0",
"docs": "/docs",
"health": "/health"
"health": "/health",
}
@app.get("/health", response_model=HealthResponse)
@@ -95,7 +97,7 @@ if FASTAPI_AVAILABLE:
version="1.0.0",
models=models,
cache_enabled=cache_enabled,
cache_size=cache_size
cache_size=cache_size,
)
@app.get("/models", response_model=ModelsResponse)
@@ -109,15 +111,12 @@ if FASTAPI_AVAILABLE:
provider=m["provider"],
dimensions=m["dimensions"],
max_tokens=m["max_tokens"],
cost_per_million=m.get("cost_per_million")
cost_per_million=m.get("cost_per_million"),
)
for m in models_list
]
return ModelsResponse(
models=model_infos,
count=len(model_infos)
)
return ModelsResponse(models=model_infos, count=len(model_infos))
@app.post("/embed", response_model=EmbeddingResponse)
async def embed_text(request: EmbeddingRequest):
@@ -144,9 +143,7 @@ if FASTAPI_AVAILABLE:
else:
# Generate embedding
embedding = generator.generate(
request.text,
model=request.model,
normalize=request.normalize
request.text, model=request.model, normalize=request.normalize
)
# Store in cache
@@ -154,10 +151,7 @@ if FASTAPI_AVAILABLE:
cache.set(hash_key, embedding, request.model)
return EmbeddingResponse(
embedding=embedding,
model=request.model,
dimensions=len(embedding),
cached=cached
embedding=embedding, model=request.model, dimensions=len(embedding), cached=cached
)
except Exception as e:
@@ -202,11 +196,13 @@ if FASTAPI_AVAILABLE:
texts_to_generate,
model=request.model,
normalize=request.normalize,
batch_size=request.batch_size
batch_size=request.batch_size,
)
# Fill in placeholders and cache
for idx, text, embedding in zip(text_indices, texts_to_generate, generated_embeddings, strict=False):
for idx, text, embedding in zip(
text_indices, texts_to_generate, generated_embeddings, strict=False
):
embeddings[idx] = embedding
if cache:
@@ -220,7 +216,7 @@ if FASTAPI_AVAILABLE:
model=request.model,
dimensions=dimensions,
count=len(embeddings),
cached_count=cached_count
cached_count=cached_count,
)
except Exception as e:
@@ -244,12 +240,16 @@ if FASTAPI_AVAILABLE:
skill_path = Path(request.skill_path)
if not skill_path.exists():
raise HTTPException(status_code=404, detail=f"Skill path not found: {request.skill_path}")
raise HTTPException(
status_code=404, detail=f"Skill path not found: {request.skill_path}"
)
# Read SKILL.md
skill_md = skill_path / "SKILL.md"
if not skill_md.exists():
raise HTTPException(status_code=404, detail=f"SKILL.md not found in {request.skill_path}")
raise HTTPException(
status_code=404, detail=f"SKILL.md not found in {request.skill_path}"
)
skill_content = skill_md.read_text()
@@ -262,10 +262,7 @@ if FASTAPI_AVAILABLE:
# Generate embeddings for chunks
embeddings, dimensions = generator.generate_batch(
chunks,
model=request.model,
normalize=True,
batch_size=32
chunks, model=request.model, normalize=True, batch_size=32
)
# TODO: Store embeddings in vector database
@@ -279,8 +276,8 @@ if FASTAPI_AVAILABLE:
metadata={
"skill_path": str(skill_path),
"chunks": len(chunks),
"content_length": len(skill_content)
}
"content_length": len(skill_content),
},
)
except HTTPException:
@@ -298,7 +295,7 @@ if FASTAPI_AVAILABLE:
@app.post("/cache/clear", response_model=dict)
async def clear_cache(
model: str | None = Query(None, description="Model to clear (all if not specified)")
model: str | None = Query(None, description="Model to clear (all if not specified)"),
):
"""Clear cache entries."""
if not cache:
@@ -306,11 +303,7 @@ if FASTAPI_AVAILABLE:
deleted = cache.clear(model=model)
return {
"status": "ok",
"deleted": deleted,
"model": model or "all"
}
return {"status": "ok", "deleted": deleted, "model": model or "all"}
@app.post("/cache/clear-expired", response_model=dict)
async def clear_expired():
@@ -320,10 +313,7 @@ if FASTAPI_AVAILABLE:
deleted = cache.clear_expired()
return {
"status": "ok",
"deleted": deleted
}
return {"status": "ok", "deleted": deleted}
else:
print("Error: FastAPI not available. Install with: pip install fastapi uvicorn")
@@ -348,12 +338,7 @@ def main():
if cache_enabled:
print(f"💾 Cache database: {cache_db}")
uvicorn.run(
"skill_seekers.embedding.server:app",
host=host,
port=port,
reload=reload
)
uvicorn.run("skill_seekers.embedding.server:app", host=host, port=port, reload=reload)
if __name__ == "__main__":
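The /embed handler reformatted above implements a plain look-aside cache (check, generate on miss, store); a minimal sketch of that flow with a dict standing in for the SQLite-backed cache (the key scheme and generator callable are assumptions):

import hashlib

_cache: dict[str, list[float]] = {}  # stand-in for the SQLite-backed EmbeddingCache

def embed_with_cache(text: str, model: str, generate) -> tuple[list[float], bool]:
    # Key on model + text so the same text under another model is a miss.
    key = hashlib.sha256(f"{model}:{text}".encode()).hexdigest()
    if key in _cache:
        return _cache[key], True       # hit: skip generation
    embedding = generate(text, model)  # miss: compute, then store
    _cache[key] = embedding
    return embedding, False

vec, cached = embed_with_cache("hello", "demo-model", lambda t, m: [0.1, 0.2, 0.3])
print(cached, len(vec))  # False 3 on first call; True 3 on repeat calls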

View File

@@ -69,15 +69,17 @@ async def generate_config(args: dict) -> list[TextContent]:
config = {
"name": name,
"description": description,
"sources": [{
"type": "documentation",
"base_url": url,
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
"url_patterns": {"include": [], "exclude": []},
"categories": {},
"rate_limit": rate_limit,
"max_pages": max_pages,
}],
"sources": [
{
"type": "documentation",
"base_url": url,
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
"url_patterns": {"include": [], "exclude": []},
"categories": {},
"rate_limit": rate_limit,
"max_pages": max_pages,
}
],
}
# Save to configs directory
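The reflowed sources list is plain JSON data; for reference, a minimal sketch of building and saving such a config (paths and values illustrative):

import json
from pathlib import Path

config = {
    "name": "demo",
    "description": "Demo documentation skill",
    "sources": [
        {
            "type": "documentation",
            "base_url": "https://example.com/docs",
            "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
            "rate_limit": 1.0,
            "max_pages": 100,
        }
    ],
}

out = Path("configs") / f"{config['name']}.json"
out.parent.mkdir(parents=True, exist_ok=True)
out.write_text(json.dumps(config, indent=2))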

View File

@@ -32,9 +32,9 @@ from .detector import ChangeDetector
from .models import SyncConfig, ChangeReport, PageChange
__all__ = [
'SyncMonitor',
'ChangeDetector',
'SyncConfig',
'ChangeReport',
'PageChange',
"SyncMonitor",
"ChangeDetector",
"SyncConfig",
"ChangeReport",
"PageChange",
]

View File

@@ -55,7 +55,7 @@ class ChangeDetector:
Returns:
Hexadecimal hash string
"""
return hashlib.sha256(content.encode('utf-8')).hexdigest()
return hashlib.sha256(content.encode("utf-8")).hexdigest()
def fetch_page(self, url: str) -> tuple[str, dict[str, str]]:
"""
@@ -72,17 +72,15 @@ class ChangeDetector:
requests.RequestException: If fetch fails
"""
response = requests.get(
url,
timeout=self.timeout,
headers={'User-Agent': 'SkillSeekers-Sync/1.0'}
url, timeout=self.timeout, headers={"User-Agent": "SkillSeekers-Sync/1.0"}
)
response.raise_for_status()
metadata = {
'last-modified': response.headers.get('Last-Modified'),
'etag': response.headers.get('ETag'),
'content-type': response.headers.get('Content-Type'),
'content-length': response.headers.get('Content-Length'),
"last-modified": response.headers.get("Last-Modified"),
"etag": response.headers.get("ETag"),
"content-type": response.headers.get("Content-Type"),
"content-length": response.headers.get("Content-Length"),
}
return response.text, metadata
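hash_content plus fetch_page give the hash-based change check; a minimal standalone sketch of the same two steps (URL illustrative):

import hashlib
import requests

def content_hash(text: str) -> str:
    # Same scheme as above: SHA-256 over the UTF-8 bytes of the page body.
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

response = requests.get(
    "https://example.com",  # illustrative URL
    timeout=30,
    headers={"User-Agent": "SkillSeekers-Sync/1.0"},
)
response.raise_for_status()
new_hash = content_hash(response.text)
old_hash = None  # would come from the previous sync state
print("changed" if new_hash != old_hash else "unchanged")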
@@ -92,7 +90,7 @@ class ChangeDetector:
url: str,
old_hash: str | None = None,
generate_diff: bool = False,
old_content: str | None = None
old_content: str | None = None,
) -> PageChange:
"""
Check if page has changed.
@@ -132,7 +130,7 @@ class ChangeDetector:
old_hash=old_hash,
new_hash=new_hash,
diff=diff,
detected_at=datetime.utcnow()
detected_at=datetime.utcnow(),
)
except requests.RequestException:
@@ -142,14 +140,11 @@ class ChangeDetector:
change_type=ChangeType.DELETED,
old_hash=old_hash,
new_hash=None,
detected_at=datetime.utcnow()
detected_at=datetime.utcnow(),
)
def check_pages(
self,
urls: list[str],
previous_hashes: dict[str, str],
generate_diffs: bool = False
self, urls: list[str], previous_hashes: dict[str, str], generate_diffs: bool = False
) -> ChangeReport:
"""
Check multiple pages for changes.
@@ -185,13 +180,15 @@ class ChangeDetector:
# Check for deleted pages (in previous state but not in current)
for url, old_hash in previous_hashes.items():
if url not in checked_urls:
deleted.append(PageChange(
url=url,
change_type=ChangeType.DELETED,
old_hash=old_hash,
new_hash=None,
detected_at=datetime.utcnow()
))
deleted.append(
PageChange(
url=url,
change_type=ChangeType.DELETED,
old_hash=old_hash,
new_hash=None,
detected_at=datetime.utcnow(),
)
)
return ChangeReport(
skill_name="unknown", # To be set by caller
@@ -200,7 +197,7 @@ class ChangeDetector:
modified=modified,
deleted=deleted,
unchanged=unchanged_count,
checked_at=datetime.utcnow()
checked_at=datetime.utcnow(),
)
def generate_diff(self, old_content: str, new_content: str) -> str:
@@ -217,15 +214,9 @@ class ChangeDetector:
old_lines = old_content.splitlines(keepends=True)
new_lines = new_content.splitlines(keepends=True)
diff = difflib.unified_diff(
old_lines,
new_lines,
fromfile='old',
tofile='new',
lineterm=''
)
diff = difflib.unified_diff(old_lines, new_lines, fromfile="old", tofile="new", lineterm="")
return ''.join(diff)
return "".join(diff)
def generate_summary_diff(self, old_content: str, new_content: str) -> str:
"""
@@ -244,16 +235,15 @@ class ChangeDetector:
diff = difflib.unified_diff(old_lines, new_lines)
diff_lines = list(diff)
added = sum(1 for line in diff_lines if line.startswith('+') and not line.startswith('+++'))
removed = sum(1 for line in diff_lines if line.startswith('-') and not line.startswith('---'))
added = sum(1 for line in diff_lines if line.startswith("+") and not line.startswith("+++"))
removed = sum(
1 for line in diff_lines if line.startswith("-") and not line.startswith("---")
)
return f"+{added} -{removed} lines"
def check_header_changes(
self,
url: str,
old_modified: str | None = None,
old_etag: str | None = None
self, url: str, old_modified: str | None = None, old_etag: str | None = None
) -> bool:
"""
Quick check using HTTP headers (no content download).
@@ -269,14 +259,12 @@ class ChangeDetector:
try:
# Use HEAD request for efficiency
response = requests.head(
url,
timeout=self.timeout,
headers={'User-Agent': 'SkillSeekers-Sync/1.0'}
url, timeout=self.timeout, headers={"User-Agent": "SkillSeekers-Sync/1.0"}
)
response.raise_for_status()
new_modified = response.headers.get('Last-Modified')
new_etag = response.headers.get('ETag')
new_modified = response.headers.get("Last-Modified")
new_etag = response.headers.get("ETag")
# Check if headers indicate change
if old_modified and new_modified and old_modified != new_modified:
@@ -289,9 +277,7 @@ class ChangeDetector:
return True
def batch_check_headers(
self,
urls: list[str],
previous_metadata: dict[str, dict[str, str]]
self, urls: list[str], previous_metadata: dict[str, dict[str, str]]
) -> list[str]:
"""
Batch check URLs using headers only.
@@ -307,8 +293,8 @@ class ChangeDetector:
for url in urls:
old_meta = previous_metadata.get(url, {})
old_modified = old_meta.get('last-modified')
old_etag = old_meta.get('etag')
old_modified = old_meta.get("last-modified")
old_etag = old_meta.get("etag")
if self.check_header_changes(url, old_modified, old_etag):
changed_urls.append(url)
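The header-based fast path boils down to one HEAD request and two comparisons; a minimal sketch (the assume-changed fallback when no headers are comparable is an assumption):

import requests

def headers_changed(url: str, old_modified: str | None, old_etag: str | None) -> bool:
    response = requests.head(
        url, timeout=30, headers={"User-Agent": "SkillSeekers-Sync/1.0"}
    )
    response.raise_for_status()
    new_modified = response.headers.get("Last-Modified")
    new_etag = response.headers.get("ETag")
    if old_modified and new_modified:
        return old_modified != new_modified
    if old_etag and new_etag:
        return old_etag != new_etag
    return True  # no comparable headers: assume changed, fall back to a full fetch

print(headers_changed("https://example.com", None, None))  # True: nothing to compare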

View File

@@ -10,6 +10,7 @@ from pydantic import BaseModel, Field
class ChangeType(str, Enum):
"""Type of change detected."""
ADDED = "added"
MODIFIED = "modified"
DELETED = "deleted"
@@ -25,8 +26,7 @@ class PageChange(BaseModel):
new_hash: str | None = Field(None, description="New content hash")
diff: str | None = Field(None, description="Content diff (if available)")
detected_at: datetime = Field(
default_factory=datetime.utcnow,
description="When change was detected"
default_factory=datetime.utcnow, description="When change was detected"
)
class Config:
@@ -37,7 +37,7 @@ class PageChange(BaseModel):
"old_hash": "abc123",
"new_hash": "def456",
"diff": "@@ -10,3 +10,4 @@\n+New content here",
"detected_at": "2024-01-15T10:30:00Z"
"detected_at": "2024-01-15T10:30:00Z",
}
}
@@ -52,8 +52,7 @@ class ChangeReport(BaseModel):
deleted: list[PageChange] = Field(default_factory=list, description="Deleted pages")
unchanged: int = Field(0, description="Number of unchanged pages")
checked_at: datetime = Field(
default_factory=datetime.utcnow,
description="When check was performed"
default_factory=datetime.utcnow, description="When check was performed"
)
@property
@@ -72,34 +71,19 @@ class SyncConfig(BaseModel):
skill_config: str = Field(..., description="Path to skill config file")
check_interval: int = Field(
default=3600,
description="Check interval in seconds (default: 1 hour)"
default=3600, description="Check interval in seconds (default: 1 hour)"
)
enabled: bool = Field(default=True, description="Whether sync is enabled")
auto_update: bool = Field(
default=False,
description="Automatically rebuild skill on changes"
)
notify_on_change: bool = Field(
default=True,
description="Send notifications on changes"
)
auto_update: bool = Field(default=False, description="Automatically rebuild skill on changes")
notify_on_change: bool = Field(default=True, description="Send notifications on changes")
notification_channels: list[str] = Field(
default_factory=list,
description="Notification channels (email, slack, webhook)"
)
webhook_url: str | None = Field(
None,
description="Webhook URL for change notifications"
default_factory=list, description="Notification channels (email, slack, webhook)"
)
webhook_url: str | None = Field(None, description="Webhook URL for change notifications")
email_recipients: list[str] = Field(
default_factory=list,
description="Email recipients for notifications"
)
slack_webhook: str | None = Field(
None,
description="Slack webhook URL"
default_factory=list, description="Email recipients for notifications"
)
slack_webhook: str | None = Field(None, description="Slack webhook URL")
class Config:
json_schema_extra = {
@@ -111,7 +95,7 @@ class SyncConfig(BaseModel):
"notify_on_change": True,
"notification_channels": ["slack", "webhook"],
"webhook_url": "https://example.com/webhook",
"slack_webhook": "https://hooks.slack.com/services/..."
"slack_webhook": "https://hooks.slack.com/services/...",
}
}
@@ -125,8 +109,7 @@ class SyncState(BaseModel):
total_checks: int = Field(default=0, description="Total checks performed")
total_changes: int = Field(default=0, description="Total changes detected")
page_hashes: dict[str, str] = Field(
default_factory=dict,
description="URL -> content hash mapping"
default_factory=dict, description="URL -> content hash mapping"
)
status: str = Field(default="idle", description="Current status")
error: str | None = Field(None, description="Last error message")
@@ -137,15 +120,9 @@ class WebhookPayload(BaseModel):
event: str = Field(..., description="Event type (change_detected, sync_complete)")
skill_name: str = Field(..., description="Skill name")
timestamp: datetime = Field(
default_factory=datetime.utcnow,
description="Event timestamp"
)
timestamp: datetime = Field(default_factory=datetime.utcnow, description="Event timestamp")
changes: ChangeReport | None = Field(None, description="Change report")
metadata: dict[str, Any] = Field(
default_factory=dict,
description="Additional metadata"
)
metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
class Config:
json_schema_extra = {
@@ -157,8 +134,8 @@ class WebhookPayload(BaseModel):
"total_pages": 150,
"added": [],
"modified": [{"url": "https://react.dev/learn"}],
"deleted": []
"deleted": [],
},
"metadata": {"source": "periodic_check"}
"metadata": {"source": "periodic_check"},
}
}

View File

@@ -51,7 +51,7 @@ class SyncMonitor:
check_interval: int = 3600,
auto_update: bool = False,
state_file: str | None = None,
on_change: Callable[[ChangeReport], None] | None = None
on_change: Callable[[ChangeReport], None] | None = None,
):
"""
Initialize sync monitor.
@@ -72,7 +72,7 @@ class SyncMonitor:
with open(self.config_path) as f:
self.skill_config = json.load(f)
self.skill_name = self.skill_config.get('name', 'unknown')
self.skill_name = self.skill_config.get("name", "unknown")
# State file
if state_file:
@@ -97,10 +97,10 @@ class SyncMonitor:
with open(self.state_file) as f:
data = json.load(f)
# Convert datetime strings back
if data.get('last_check'):
data['last_check'] = datetime.fromisoformat(data['last_check'])
if data.get('last_change'):
data['last_change'] = datetime.fromisoformat(data['last_change'])
if data.get("last_check"):
data["last_check"] = datetime.fromisoformat(data["last_check"])
if data.get("last_change"):
data["last_change"] = datetime.fromisoformat(data["last_change"])
return SyncState(**data)
else:
return SyncState(skill_name=self.skill_name)
@@ -109,12 +109,12 @@ class SyncMonitor:
"""Save current state to file."""
# Convert datetime to ISO format
data = self.state.dict()
if data.get('last_check'):
data['last_check'] = data['last_check'].isoformat()
if data.get('last_change'):
data['last_change'] = data['last_change'].isoformat()
if data.get("last_check"):
data["last_check"] = data["last_check"].isoformat()
if data.get("last_change"):
data["last_change"] = data["last_change"].isoformat()
with open(self.state_file, 'w') as f:
with open(self.state_file, "w") as f:
json.dump(data, f, indent=2)
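The quote changes above sit around a datetime/JSON round-trip: json cannot serialize datetime, so the state file stores ISO-8601 strings; a minimal sketch:

import json
from datetime import datetime

state = {"last_check": datetime.utcnow(), "total_checks": 3}

# Save: datetime -> ISO-8601 string before json.dump.
data = dict(state)
data["last_check"] = data["last_check"].isoformat()
blob = json.dumps(data, indent=2)

# Load: ISO-8601 string -> datetime after json.load.
loaded = json.loads(blob)
loaded["last_check"] = datetime.fromisoformat(loaded["last_check"])
print(type(loaded["last_check"]).__name__, loaded["total_checks"])  # datetime 3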
def check_now(self, generate_diffs: bool = False) -> ChangeReport:
@@ -132,7 +132,7 @@ class SyncMonitor:
try:
# Get URLs to check from config
base_url = self.skill_config.get('base_url')
base_url = self.skill_config.get("base_url")
# TODO: In real implementation, get actual URLs from scraper
# For now, simulate with base URL only
@@ -140,9 +140,7 @@ class SyncMonitor:
# Check for changes
report = self.detector.check_pages(
urls=urls,
previous_hashes=self.state.page_hashes,
generate_diffs=generate_diffs
urls=urls, previous_hashes=self.state.page_hashes, generate_diffs=generate_diffs
)
report.skill_name = self.skill_name
@@ -192,7 +190,7 @@ class SyncMonitor:
event="change_detected",
skill_name=self.skill_name,
changes=report,
metadata={"auto_update": self.auto_update}
metadata={"auto_update": self.auto_update},
)
self.notifier.send(payload)
@@ -214,9 +212,7 @@ class SyncMonitor:
self._running = True
# Schedule checks
schedule.every(self.check_interval).seconds.do(
lambda: self.check_now()
)
schedule.every(self.check_interval).seconds.do(lambda: self.check_now())
# Run in thread
def run_schedule():

View File

@@ -34,7 +34,7 @@ class Notifier:
webhook_url: str | None = None,
slack_webhook: str | None = None,
email_recipients: list[str] | None = None,
console: bool = True
console: bool = True,
):
"""
Initialize notifier.
@@ -45,8 +45,8 @@ class Notifier:
email_recipients: List of email recipients
console: Whether to print to console
"""
self.webhook_url = webhook_url or os.getenv('SYNC_WEBHOOK_URL')
self.slack_webhook = slack_webhook or os.getenv('SLACK_WEBHOOK_URL')
self.webhook_url = webhook_url or os.getenv("SYNC_WEBHOOK_URL")
self.slack_webhook = slack_webhook or os.getenv("SLACK_WEBHOOK_URL")
self.email_recipients = email_recipients or []
self.console = console
@@ -92,8 +92,8 @@ class Notifier:
response = requests.post(
self.webhook_url,
json=payload.dict(),
headers={'Content-Type': 'application/json'},
timeout=10
headers={"Content-Type": "application/json"},
timeout=10,
)
response.raise_for_status()
print(f"✅ Webhook notification sent to {self.webhook_url}")
@@ -124,14 +124,10 @@ class Notifier:
slack_payload = {
"text": text,
"username": "Skill Seekers Sync",
"icon_emoji": ":books:"
"icon_emoji": ":books:",
}
response = requests.post(
self.slack_webhook,
json=slack_payload,
timeout=10
)
response = requests.post(self.slack_webhook, json=slack_payload, timeout=10)
response.raise_for_status()
print("✅ Slack notification sent")
except Exception as e:
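Both notification paths are plain requests.post calls; a minimal sketch of the Slack-style payload (the webhook URL is a placeholder):

import requests

slack_payload = {
    "text": "react-docs: 2 pages modified",
    "username": "Skill Seekers Sync",
    "icon_emoji": ":books:",
}
try:
    response = requests.post(
        "https://hooks.slack.com/services/PLACEHOLDER",  # placeholder webhook URL
        json=slack_payload,
        timeout=10,
    )
    response.raise_for_status()
    print("notification sent")
except requests.RequestException as exc:
    print(f"notification failed: {exc}")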

View File

@@ -85,9 +85,17 @@ class TestAdaptorBenchmarks(unittest.TestCase):
# Platforms to benchmark
platforms = [
"claude", "gemini", "openai", "markdown", # IDE integrations
"langchain", "llama-index", "haystack", # RAG frameworks
"weaviate", "chroma", "faiss", "qdrant" # Vector DBs
"claude",
"gemini",
"openai",
"markdown", # IDE integrations
"langchain",
"llama-index",
"haystack", # RAG frameworks
"weaviate",
"chroma",
"faiss",
"qdrant", # Vector DBs
]
results = {}
@@ -115,20 +123,19 @@ class TestAdaptorBenchmarks(unittest.TestCase):
min_time = min(times)
max_time = max(times)
results[platform] = {
"avg": avg_time,
"min": min_time,
"max": max_time
}
results[platform] = {"avg": avg_time, "min": min_time, "max": max_time}
print(f"{platform:15} - Avg: {avg_time*1000:6.2f}ms | "
f"Min: {min_time*1000:6.2f}ms | Max: {max_time*1000:6.2f}ms")
print(
f"{platform:15} - Avg: {avg_time * 1000:6.2f}ms | "
f"Min: {min_time * 1000:6.2f}ms | Max: {max_time * 1000:6.2f}ms"
)
# Performance assertions (should complete in reasonable time)
for platform, metrics in results.items():
self.assertLess(
metrics["avg"], 0.5, # Should average < 500ms
f"{platform} format_skill_md too slow: {metrics['avg']*1000:.2f}ms"
metrics["avg"],
0.5, # Should average < 500ms
f"{platform} format_skill_md too slow: {metrics['avg'] * 1000:.2f}ms",
)
def test_benchmark_package_operations(self):
@@ -158,12 +165,9 @@ class TestAdaptorBenchmarks(unittest.TestCase):
# Get file size
file_size_kb = package_path.stat().st_size / 1024
results[platform] = {
"time": elapsed,
"size_kb": file_size_kb
}
results[platform] = {"time": elapsed, "size_kb": file_size_kb}
print(f"{platform:15} - Time: {elapsed*1000:7.2f}ms | Size: {file_size_kb:7.1f} KB")
print(f"{platform:15} - Time: {elapsed * 1000:7.2f}ms | Size: {file_size_kb:7.1f} KB")
# Validate output
self.assertTrue(package_path.exists())
@@ -171,12 +175,14 @@ class TestAdaptorBenchmarks(unittest.TestCase):
# Performance assertions
for platform, metrics in results.items():
self.assertLess(
metrics["time"], 1.0, # Should complete < 1 second
f"{platform} packaging too slow: {metrics['time']*1000:.2f}ms"
metrics["time"],
1.0, # Should complete < 1 second
f"{platform} packaging too slow: {metrics['time'] * 1000:.2f}ms",
)
self.assertLess(
metrics["size_kb"], 1000, # Should be < 1MB for 10 refs
f"{platform} package too large: {metrics['size_kb']:.1f}KB"
metrics["size_kb"],
1000, # Should be < 1MB for 10 refs
f"{platform} package too large: {metrics['size_kb']:.1f}KB",
)
def test_benchmark_scaling_with_reference_count(self):
@@ -210,14 +216,18 @@ class TestAdaptorBenchmarks(unittest.TestCase):
json.loads(formatted)
size_kb = len(formatted) / 1024
results.append({
"count": ref_count,
"time": elapsed,
"time_per_ref": time_per_ref,
"size_kb": size_kb
})
results.append(
{
"count": ref_count,
"time": elapsed,
"time_per_ref": time_per_ref,
"size_kb": size_kb,
}
)
print(f"{ref_count:4} | {elapsed*1000:10.2f} | {time_per_ref*1000:10.3f} | {size_kb:10.1f}")
print(
f"{ref_count:4} | {elapsed * 1000:10.2f} | {time_per_ref * 1000:10.3f} | {size_kb:10.1f}"
)
# Analyze scaling behavior
# Time per ref should not increase significantly (linear scaling)
@@ -230,10 +240,7 @@ class TestAdaptorBenchmarks(unittest.TestCase):
print(f"(Time per ref at 50 refs / Time per ref at 1 ref)")
# Assert linear or sub-linear scaling (not exponential)
self.assertLess(
scaling_factor, 3.0,
f"Non-linear scaling detected: {scaling_factor:.2f}x"
)
self.assertLess(scaling_factor, 3.0, f"Non-linear scaling detected: {scaling_factor:.2f}x")
def test_benchmark_json_vs_zip_size_comparison(self):
"""Compare output sizes: JSON vs ZIP/tar.gz"""
@@ -263,16 +270,15 @@ class TestAdaptorBenchmarks(unittest.TestCase):
size_kb = package_path.stat().st_size / 1024
results[platform] = {
"format": format_name,
"size_kb": size_kb
}
results[platform] = {"format": format_name, "size_kb": size_kb}
print(f"{platform:15} | {format_name:8} | {size_kb:10.1f}")
# Analyze results
json_sizes = [v["size_kb"] for k, v in results.items() if v["format"] == "JSON"]
compressed_sizes = [v["size_kb"] for k, v in results.items() if v["format"] in ["ZIP", "tar.gz"]]
compressed_sizes = [
v["size_kb"] for k, v in results.items() if v["format"] in ["ZIP", "tar.gz"]
]
if json_sizes and compressed_sizes:
avg_json = sum(json_sizes) / len(json_sizes)
@@ -280,7 +286,7 @@ class TestAdaptorBenchmarks(unittest.TestCase):
print(f"\nAverage JSON size: {avg_json:.1f} KB")
print(f"Average compressed size: {avg_compressed:.1f} KB")
print(f"Compression ratio: {avg_json/avg_compressed:.2f}x")
print(f"Compression ratio: {avg_json / avg_compressed:.2f}x")
def test_benchmark_metadata_overhead(self):
"""Measure metadata processing overhead"""
@@ -299,7 +305,7 @@ class TestAdaptorBenchmarks(unittest.TestCase):
description="A comprehensive test skill for benchmarking purposes",
version="2.5.0",
author="Benchmark Suite",
tags=["test", "benchmark", "performance", "validation", "quality"]
tags=["test", "benchmark", "performance", "validation", "quality"],
)
adaptor = get_adaptor("langchain")
@@ -326,15 +332,12 @@ class TestAdaptorBenchmarks(unittest.TestCase):
overhead = avg_rich - avg_minimal
overhead_pct = (overhead / avg_minimal) * 100
print(f"\nMinimal metadata: {avg_minimal*1000:.2f}ms")
print(f"Rich metadata: {avg_rich*1000:.2f}ms")
print(f"Overhead: {overhead*1000:.2f}ms ({overhead_pct:.1f}%)")
print(f"\nMinimal metadata: {avg_minimal * 1000:.2f}ms")
print(f"Rich metadata: {avg_rich * 1000:.2f}ms")
print(f"Overhead: {overhead * 1000:.2f}ms ({overhead_pct:.1f}%)")
# Overhead should be negligible (< 10%)
self.assertLess(
overhead_pct, 10.0,
f"Metadata overhead too high: {overhead_pct:.1f}%"
)
self.assertLess(overhead_pct, 10.0, f"Metadata overhead too high: {overhead_pct:.1f}%")
def test_benchmark_empty_vs_full_skill(self):
"""Compare performance: empty skill vs full skill"""
@@ -360,9 +363,9 @@ class TestAdaptorBenchmarks(unittest.TestCase):
adaptor.format_skill_md(full_dir, metadata)
full_time = time.perf_counter() - start
print(f"\nEmpty skill: {empty_time*1000:.2f}ms")
print(f"Full skill (50 refs): {full_time*1000:.2f}ms")
print(f"Ratio: {full_time/empty_time:.1f}x")
print(f"\nEmpty skill: {empty_time * 1000:.2f}ms")
print(f"Full skill (50 refs): {full_time * 1000:.2f}ms")
print(f"Ratio: {full_time / empty_time:.1f}x")
# Empty should be very fast
self.assertLess(empty_time, 0.01, "Empty skill processing too slow")
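All timings in this test module follow the same time.perf_counter pattern; a minimal standalone sketch:

import time

def work() -> None:
    sum(i * i for i in range(10_000))

times = []
for _ in range(5):  # a handful of iterations smooths out noise
    start = time.perf_counter()
    work()
    times.append(time.perf_counter() - start)

avg, lo, hi = sum(times) / len(times), min(times), max(times)
print(f"Avg: {avg * 1000:6.2f}ms | Min: {lo * 1000:6.2f}ms | Max: {hi * 1000:6.2f}ms")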

View File

@@ -662,8 +662,13 @@ export default {
def test_e2e_all_rag_adaptors_from_same_skill(self):
"""Test all 7 RAG adaptors can package the same skill"""
rag_platforms = [
"langchain", "llama-index", "haystack",
"weaviate", "chroma", "faiss", "qdrant"
"langchain",
"llama-index",
"haystack",
"weaviate",
"chroma",
"faiss",
"qdrant",
]
packages = {}
@@ -674,15 +679,11 @@ export default {
package_path = adaptor.package(self.skill_dir, self.output_dir)
# Verify package was created
self.assertTrue(
package_path.exists(),
f"Package not created for {platform}"
)
self.assertTrue(package_path.exists(), f"Package not created for {platform}")
# Verify it's a JSON file
self.assertTrue(
str(package_path).endswith(".json"),
f"{platform} should produce JSON file"
str(package_path).endswith(".json"), f"{platform} should produce JSON file"
)
# Store for later verification
@@ -696,10 +697,7 @@ export default {
with open(path) as f:
data = json.load(f)
# Should be valid JSON (dict or list)
self.assertIsInstance(
data, (dict, list),
f"{platform} should produce valid JSON"
)
self.assertIsInstance(data, (dict, list), f"{platform} should produce valid JSON")
def test_e2e_rag_adaptors_preserve_metadata(self):
"""Test that metadata is preserved across RAG adaptors"""
@@ -708,7 +706,7 @@ export default {
description="Vue.js framework skill",
version="2.0.0",
author="Test Author",
tags=["vue", "javascript", "frontend"]
tags=["vue", "javascript", "frontend"],
)
# Test subset of platforms (representative sample)
@@ -758,33 +756,30 @@ export default {
# Define expected structure for each platform
validations = {
"langchain": lambda d: (
isinstance(d, list) and
all("page_content" in item and "metadata" in item for item in d)
isinstance(d, list)
and all("page_content" in item and "metadata" in item for item in d)
),
"llama-index": lambda d: (
isinstance(d, list) and
all("text" in item and "metadata" in item for item in d)
isinstance(d, list) and all("text" in item and "metadata" in item for item in d)
),
"haystack": lambda d: (
isinstance(d, list) and
all("content" in item and "meta" in item for item in d)
isinstance(d, list) and all("content" in item and "meta" in item for item in d)
),
"weaviate": lambda d: (
isinstance(d, dict) and
"schema" in d and "objects" in d and "class_name" in d
isinstance(d, dict) and "schema" in d and "objects" in d and "class_name" in d
),
"chroma": lambda d: (
isinstance(d, dict) and
"documents" in d and "metadatas" in d and "ids" in d and
"collection_name" in d
isinstance(d, dict)
and "documents" in d
and "metadatas" in d
and "ids" in d
and "collection_name" in d
),
"faiss": lambda d: (
isinstance(d, dict) and
"documents" in d and "metadatas" in d and "ids" in d
isinstance(d, dict) and "documents" in d and "metadatas" in d and "ids" in d
),
"qdrant": lambda d: (
isinstance(d, dict) and
"collection_name" in d and "points" in d and "config" in d
isinstance(d, dict) and "collection_name" in d and "points" in d and "config" in d
),
}
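The validations table pairs each platform with a structural predicate over the parsed JSON; a minimal sketch of the same pattern (two entries and sample payloads, both illustrative):

validations = {
    "langchain": lambda d: isinstance(d, list)
    and all("page_content" in item and "metadata" in item for item in d),
    "chroma": lambda d: isinstance(d, dict)
    and all(k in d for k in ("documents", "metadatas", "ids", "collection_name")),
}

samples = {
    "langchain": [{"page_content": "...", "metadata": {}}],
    "chroma": {"documents": [], "metadatas": [], "ids": [], "collection_name": "demo"},
}

for platform, validate in validations.items():
    assert validate(samples[platform]), f"{platform}: incorrect JSON structure"
print("all structures valid")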
@@ -795,8 +790,7 @@ export default {
# Validate structure
self.assertTrue(
validate_func(data),
f"{platform} validation failed: incorrect JSON structure"
validate_func(data), f"{platform} validation failed: incorrect JSON structure"
)
def test_e2e_rag_empty_skill_handling(self):
@@ -838,9 +832,7 @@ export default {
if platform == "langchain":
categories = {item["metadata"]["category"] for item in data}
elif platform == "weaviate":
categories = {
obj["properties"]["category"] for obj in data["objects"]
}
categories = {obj["properties"]["category"] for obj in data["objects"]}
elif platform == "chroma":
categories = {meta["category"] for meta in data["metadatas"]}
@@ -854,8 +846,7 @@ export default {
# Check that at least one reference category exists
ref_categories = categories - {"overview"}
self.assertGreater(
len(ref_categories), 0,
f"{platform}: Should have at least one reference category"
len(ref_categories), 0, f"{platform}: Should have at least one reference category"
)
def test_e2e_rag_integration_workflow_chromadb(self):
@@ -878,17 +869,10 @@ export default {
# Create collection and add documents
collection = client.create_collection(data["collection_name"])
collection.add(
documents=data["documents"],
metadatas=data["metadatas"],
ids=data["ids"]
)
collection.add(documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"])
# Query
results = collection.query(
query_texts=["reactivity"],
n_results=2
)
results = collection.query(query_texts=["reactivity"], n_results=2)
# Verify results
self.assertGreater(len(results["documents"][0]), 0, "Should return results")

View File

@@ -28,9 +28,7 @@ class TestChromaAdaptor:
# Create SKILL.md
skill_md = skill_dir / "SKILL.md"
skill_md.write_text(
"# Test Skill\n\nThis is a test skill for Chroma format."
)
skill_md.write_text("# Test Skill\n\nThis is a test skill for Chroma format.")
# Create references directory with files
refs_dir = skill_dir / "references"
@@ -40,9 +38,7 @@ class TestChromaAdaptor:
# Format as Chroma collection
adaptor = get_adaptor("chroma")
metadata = SkillMetadata(
name="test_skill", description="Test skill", version="1.0.0"
)
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
collection_json = adaptor.format_skill_md(skill_dir, metadata)
@@ -124,7 +120,10 @@ class TestChromaAdaptor:
# Upload may fail if chromadb not installed (expected)
assert "message" in result
# Either chromadb not installed or connection error
assert ("chromadb not installed" in result["message"] or "Failed to connect" in result["message"])
assert (
"chromadb not installed" in result["message"]
or "Failed to connect" in result["message"]
)
def test_validate_api_key_returns_false(self):
"""Test that API key validation returns False (no API needed)."""
@@ -157,9 +156,7 @@ class TestChromaAdaptor:
skill_dir.mkdir()
adaptor = get_adaptor("chroma")
metadata = SkillMetadata(
name="empty_skill", description="Empty", version="1.0.0"
)
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
collection_json = adaptor.format_skill_md(skill_dir, metadata)
collection = json.loads(collection_json)
@@ -179,9 +176,7 @@ class TestChromaAdaptor:
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
adaptor = get_adaptor("chroma")
metadata = SkillMetadata(
name="refs_only", description="Refs only", version="1.0.0"
)
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
collection_json = adaptor.format_skill_md(skill_dir, metadata)
collection = json.loads(collection_json)
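Each adaptor test in this commit repeats the same three steps: build a throwaway skill directory, get an adaptor, parse the format_skill_md output; a condensed sketch (the skill_seekers import paths are assumptions, not verified here):

import json
import tempfile
from pathlib import Path

from skill_seekers.adaptors import get_adaptor  # import path assumed
from skill_seekers.adaptors.base import SkillMetadata  # import path assumed

skill_dir = Path(tempfile.mkdtemp()) / "test_skill"
skill_dir.mkdir()
(skill_dir / "SKILL.md").write_text("# Test Skill\n\nDemo content.")
(skill_dir / "references").mkdir()

adaptor = get_adaptor("chroma")
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
collection = json.loads(adaptor.format_skill_md(skill_dir, metadata))
print(sorted(collection))  # expect collection_name / documents / ids / metadatas keys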

View File

@@ -28,9 +28,7 @@ class TestFAISSAdaptor:
# Create SKILL.md
skill_md = skill_dir / "SKILL.md"
skill_md.write_text(
"# Test Skill\n\nThis is a test skill for FAISS format."
)
skill_md.write_text("# Test Skill\n\nThis is a test skill for FAISS format.")
# Create references directory with files
refs_dir = skill_dir / "references"
@@ -40,9 +38,7 @@ class TestFAISSAdaptor:
# Format as FAISS index data
adaptor = get_adaptor("faiss")
metadata = SkillMetadata(
name="test_skill", description="Test skill", version="1.0.0"
)
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
index_json = adaptor.format_skill_md(skill_dir, metadata)
@@ -158,9 +154,7 @@ class TestFAISSAdaptor:
skill_dir.mkdir()
adaptor = get_adaptor("faiss")
metadata = SkillMetadata(
name="empty_skill", description="Empty", version="1.0.0"
)
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
index_json = adaptor.format_skill_md(skill_dir, metadata)
index_data = json.loads(index_json)
@@ -180,9 +174,7 @@ class TestFAISSAdaptor:
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
adaptor = get_adaptor("faiss")
metadata = SkillMetadata(
name="refs_only", description="Refs only", version="1.0.0"
)
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
index_json = adaptor.format_skill_md(skill_dir, metadata)
index_data = json.loads(index_json)

View File

@@ -28,9 +28,7 @@ class TestHaystackAdaptor:
# Create SKILL.md
skill_md = skill_dir / "SKILL.md"
skill_md.write_text(
"# Test Skill\n\nThis is a test skill for Haystack format."
)
skill_md.write_text("# Test Skill\n\nThis is a test skill for Haystack format.")
# Create references directory with files
refs_dir = skill_dir / "references"
@@ -40,9 +38,7 @@ class TestHaystackAdaptor:
# Format as Haystack Documents
adaptor = get_adaptor("haystack")
metadata = SkillMetadata(
name="test_skill", description="Test skill", version="1.0.0"
)
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
documents_json = adaptor.format_skill_md(skill_dir, metadata)
@@ -112,7 +108,7 @@ class TestHaystackAdaptor:
"""Test upload returns instructions (no actual upload)."""
# Create test package
package_path = tmp_path / "test-haystack.json"
package_path.write_text('[]')
package_path.write_text("[]")
adaptor = get_adaptor("haystack")
result = adaptor.upload(package_path, "fake-key")
@@ -154,9 +150,7 @@ class TestHaystackAdaptor:
skill_dir.mkdir()
adaptor = get_adaptor("haystack")
metadata = SkillMetadata(
name="empty_skill", description="Empty", version="1.0.0"
)
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
documents_json = adaptor.format_skill_md(skill_dir, metadata)
documents = json.loads(documents_json)
@@ -174,9 +168,7 @@ class TestHaystackAdaptor:
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
adaptor = get_adaptor("haystack")
metadata = SkillMetadata(
name="refs_only", description="Refs only", version="1.0.0"
)
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
documents_json = adaptor.format_skill_md(skill_dir, metadata)
documents = json.loads(documents_json)

View File

@@ -28,9 +28,7 @@ class TestLangChainAdaptor:
# Create SKILL.md
skill_md = skill_dir / "SKILL.md"
skill_md.write_text(
"# Test Skill\n\nThis is a test skill for LangChain format."
)
skill_md.write_text("# Test Skill\n\nThis is a test skill for LangChain format.")
# Create references directory with files
refs_dir = skill_dir / "references"
@@ -40,9 +38,7 @@ class TestLangChainAdaptor:
# Format as LangChain Documents
adaptor = get_adaptor("langchain")
metadata = SkillMetadata(
name="test_skill", description="Test skill", version="1.0.0"
)
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
documents_json = adaptor.format_skill_md(skill_dir, metadata)
@@ -112,7 +108,7 @@ class TestLangChainAdaptor:
"""Test upload returns instructions (no actual upload)."""
# Create test package
package_path = tmp_path / "test-langchain.json"
package_path.write_text('[]')
package_path.write_text("[]")
adaptor = get_adaptor("langchain")
result = adaptor.upload(package_path, "fake-key")
@@ -153,9 +149,7 @@ class TestLangChainAdaptor:
skill_dir.mkdir()
adaptor = get_adaptor("langchain")
metadata = SkillMetadata(
name="empty_skill", description="Empty", version="1.0.0"
)
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
documents_json = adaptor.format_skill_md(skill_dir, metadata)
documents = json.loads(documents_json)
@@ -173,9 +167,7 @@ class TestLangChainAdaptor:
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
adaptor = get_adaptor("langchain")
metadata = SkillMetadata(
name="refs_only", description="Refs only", version="1.0.0"
)
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
documents_json = adaptor.format_skill_md(skill_dir, metadata)
documents = json.loads(documents_json)

View File

@@ -28,9 +28,7 @@ class TestLlamaIndexAdaptor:
# Create SKILL.md
skill_md = skill_dir / "SKILL.md"
skill_md.write_text(
"# Test Skill\n\nThis is a test skill for LlamaIndex format."
)
skill_md.write_text("# Test Skill\n\nThis is a test skill for LlamaIndex format.")
# Create references directory with files
refs_dir = skill_dir / "references"
@@ -40,9 +38,7 @@ class TestLlamaIndexAdaptor:
# Format as LlamaIndex Documents
adaptor = get_adaptor("llama-index")
metadata = SkillMetadata(
name="test_skill", description="Test skill", version="1.0.0"
)
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
documents_json = adaptor.format_skill_md(skill_dir, metadata)
@@ -112,7 +108,7 @@ class TestLlamaIndexAdaptor:
"""Test upload returns instructions (no actual upload)."""
# Create test package
package_path = tmp_path / "test-llama-index.json"
package_path.write_text('[]')
package_path.write_text("[]")
adaptor = get_adaptor("llama-index")
result = adaptor.upload(package_path, "fake-key")
@@ -153,9 +149,7 @@ class TestLlamaIndexAdaptor:
skill_dir.mkdir()
adaptor = get_adaptor("llama-index")
metadata = SkillMetadata(
name="empty_skill", description="Empty", version="1.0.0"
)
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
documents_json = adaptor.format_skill_md(skill_dir, metadata)
documents = json.loads(documents_json)
@@ -173,9 +167,7 @@ class TestLlamaIndexAdaptor:
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
adaptor = get_adaptor("llama-index")
metadata = SkillMetadata(
name="refs_only", description="Refs only", version="1.0.0"
)
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
documents_json = adaptor.format_skill_md(skill_dir, metadata)
documents = json.loads(documents_json)

View File

@@ -28,9 +28,7 @@ class TestQdrantAdaptor:
# Create SKILL.md
skill_md = skill_dir / "SKILL.md"
skill_md.write_text(
"# Test Skill\n\nThis is a test skill for Qdrant format."
)
skill_md.write_text("# Test Skill\n\nThis is a test skill for Qdrant format.")
# Create references directory with files
refs_dir = skill_dir / "references"
@@ -40,9 +38,7 @@ class TestQdrantAdaptor:
# Format as Qdrant points
adaptor = get_adaptor("qdrant")
metadata = SkillMetadata(
name="test_skill", description="Test skill", version="1.0.0"
)
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
points_json = adaptor.format_skill_md(skill_dir, metadata)
@@ -119,7 +115,7 @@ class TestQdrantAdaptor:
"""Test upload returns instructions (no actual upload)."""
# Create test package
package_path = tmp_path / "test-qdrant.json"
package_path.write_text('[]')
package_path.write_text("[]")
adaptor = get_adaptor("qdrant")
result = adaptor.upload(package_path, "fake-key")
@@ -160,9 +156,7 @@ class TestQdrantAdaptor:
skill_dir.mkdir()
adaptor = get_adaptor("qdrant")
metadata = SkillMetadata(
name="empty_skill", description="Empty", version="1.0.0"
)
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
points_json = adaptor.format_skill_md(skill_dir, metadata)
result = json.loads(points_json)
@@ -181,9 +175,7 @@ class TestQdrantAdaptor:
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
adaptor = get_adaptor("qdrant")
metadata = SkillMetadata(
name="refs_only", description="Refs only", version="1.0.0"
)
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
points_json = adaptor.format_skill_md(skill_dir, metadata)
result = json.loads(points_json)

View File

@@ -28,9 +28,7 @@ class TestWeaviateAdaptor:
# Create SKILL.md
skill_md = skill_dir / "SKILL.md"
skill_md.write_text(
"# Test Skill\n\nThis is a test skill for Weaviate format."
)
skill_md.write_text("# Test Skill\n\nThis is a test skill for Weaviate format.")
# Create references directory with files
refs_dir = skill_dir / "references"
@@ -40,9 +38,7 @@ class TestWeaviateAdaptor:
# Format as Weaviate objects
adaptor = get_adaptor("weaviate")
metadata = SkillMetadata(
name="test_skill", description="Test skill", version="1.0.0"
)
metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0")
objects_json = adaptor.format_skill_md(skill_dir, metadata)
@@ -119,7 +115,7 @@ class TestWeaviateAdaptor:
"""Test upload returns instructions (no actual upload)."""
# Create test package
package_path = tmp_path / "test-weaviate.json"
package_path.write_text('[]')
package_path.write_text("[]")
adaptor = get_adaptor("weaviate")
result = adaptor.upload(package_path, "fake-key")
@@ -127,7 +123,11 @@ class TestWeaviateAdaptor:
# Upload may fail if weaviate not installed (expected)
assert "message" in result
# Either weaviate not installed, invalid JSON, or connection error
assert ("import weaviate" in result["message"] or "Failed to connect" in result["message"] or result["success"] is False)
assert (
"import weaviate" in result["message"]
or "Failed to connect" in result["message"]
or result["success"] is False
)
def test_validate_api_key_returns_false(self):
"""Test that API key validation returns False (no API needed)."""
@@ -160,9 +160,7 @@ class TestWeaviateAdaptor:
skill_dir.mkdir()
adaptor = get_adaptor("weaviate")
metadata = SkillMetadata(
name="empty_skill", description="Empty", version="1.0.0"
)
metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0")
objects_json = adaptor.format_skill_md(skill_dir, metadata)
result = json.loads(objects_json)
@@ -181,9 +179,7 @@ class TestWeaviateAdaptor:
(refs_dir / "test.md").write_text("# Test\n\nTest content.")
adaptor = get_adaptor("weaviate")
metadata = SkillMetadata(
name="refs_only", description="Refs only", version="1.0.0"
)
metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0")
objects_json = adaptor.format_skill_md(skill_dir, metadata)
result = json.loads(objects_json)

View File

@@ -12,7 +12,7 @@ from skill_seekers.benchmark import (
BenchmarkResult,
BenchmarkRunner,
BenchmarkReport,
Metric
Metric,
)
from skill_seekers.benchmark.models import TimingResult, MemoryUsage
@@ -37,12 +37,7 @@ class TestBenchmarkResult:
"""Test adding timing result."""
result = BenchmarkResult("test")
timing = TimingResult(
operation="test_op",
duration=1.5,
iterations=1,
avg_duration=1.5
)
timing = TimingResult(operation="test_op", duration=1.5, iterations=1, avg_duration=1.5)
result.add_timing(timing)
@@ -55,11 +50,7 @@ class TestBenchmarkResult:
result = BenchmarkResult("test")
usage = MemoryUsage(
operation="test_op",
before_mb=100.0,
after_mb=150.0,
peak_mb=160.0,
allocated_mb=50.0
operation="test_op", before_mb=100.0, after_mb=150.0, peak_mb=160.0, allocated_mb=50.0
)
result.add_memory(usage)
@@ -72,11 +63,7 @@ class TestBenchmarkResult:
"""Test adding custom metric."""
result = BenchmarkResult("test")
metric = Metric(
name="pages_per_sec",
value=12.5,
unit="pages/sec"
)
metric = Metric(name="pages_per_sec", value=12.5, unit="pages/sec")
result.add_metric(metric)
@@ -107,12 +94,7 @@ class TestBenchmarkResult:
"""Test report generation."""
result = BenchmarkResult("test")
timing = TimingResult(
operation="test_op",
duration=1.0,
iterations=1,
avg_duration=1.0
)
timing = TimingResult(operation="test_op", duration=1.0, iterations=1, avg_duration=1.0)
result.add_timing(timing)
report = result.to_report()
@@ -303,7 +285,7 @@ class TestBenchmark:
before_mb=100.0,
after_mb=1200.0,
peak_mb=1500.0,
allocated_mb=1100.0
allocated_mb=1100.0,
)
benchmark.result.add_memory(usage)
@@ -370,10 +352,7 @@ class TestBenchmarkRunner:
with bench.timer("op2"):
time.sleep(0.03)
reports = runner.run_suite({
"test1": bench1,
"test2": bench2
})
reports = runner.run_suite({"test1": bench1, "test2": bench2})
assert len(reports) == 2
assert "test1" in reports
@@ -405,6 +384,7 @@ class TestBenchmarkRunner:
# Compare
from skill_seekers.benchmark.models import ComparisonReport
comparison = runner.compare(baseline_path, improved_path)
assert isinstance(comparison, ComparisonReport)
@@ -458,6 +438,7 @@ class TestBenchmarkRunner:
def test_cleanup_old(self, tmp_path):
"""Test cleaning up old benchmarks."""
import os
runner = BenchmarkRunner(output_dir=tmp_path)
# Create 10 benchmark files with different timestamps
@@ -476,10 +457,10 @@ class TestBenchmarkRunner:
"memory": [],
"metrics": [],
"system_info": {},
"recommendations": []
"recommendations": [],
}
with open(file_path, 'w') as f:
with open(file_path, "w") as f:
json.dump(report_data, f)
# Set different modification times
@@ -505,12 +486,7 @@ class TestBenchmarkModels:
def test_timing_result_model(self):
"""Test TimingResult model."""
timing = TimingResult(
operation="test",
duration=1.5,
iterations=10,
avg_duration=0.15
)
timing = TimingResult(operation="test", duration=1.5, iterations=10, avg_duration=0.15)
assert timing.operation == "test"
assert timing.duration == 1.5
@@ -520,11 +496,7 @@ class TestBenchmarkModels:
def test_memory_usage_model(self):
"""Test MemoryUsage model."""
usage = MemoryUsage(
operation="allocate",
before_mb=100.0,
after_mb=200.0,
peak_mb=250.0,
allocated_mb=100.0
operation="allocate", before_mb=100.0, after_mb=200.0, peak_mb=250.0, allocated_mb=100.0
)
assert usage.operation == "allocate"
@@ -533,11 +505,7 @@ class TestBenchmarkModels:
def test_metric_model(self):
"""Test Metric model."""
metric = Metric(
name="throughput",
value=125.5,
unit="ops/sec"
)
metric = Metric(name="throughput", value=125.5, unit="ops/sec")
assert metric.name == "throughput"
assert metric.value == 125.5
@@ -551,26 +519,19 @@ class TestBenchmarkModels:
started_at=datetime.utcnow(),
finished_at=datetime.utcnow(),
total_duration=5.0,
timings=[
TimingResult(
operation="op1",
duration=2.0,
iterations=1,
avg_duration=2.0
)
],
timings=[TimingResult(operation="op1", duration=2.0, iterations=1, avg_duration=2.0)],
memory=[
MemoryUsage(
operation="op1",
before_mb=100.0,
after_mb=200.0,
peak_mb=250.0,
allocated_mb=100.0
allocated_mb=100.0,
)
],
metrics=[],
system_info={},
recommendations=[]
recommendations=[],
)
summary = report.summary
@@ -592,7 +553,7 @@ class TestBenchmarkModels:
memory=[],
metrics=[],
system_info={},
recommendations=[]
recommendations=[],
)
current = BenchmarkReport(
@@ -604,7 +565,7 @@ class TestBenchmarkModels:
memory=[],
metrics=[],
system_info={},
recommendations=[]
recommendations=[],
)
comparison = ComparisonReport(
@@ -614,7 +575,7 @@ class TestBenchmarkModels:
improvements=[],
regressions=["Slower performance"],
speedup_factor=0.5,
memory_change_mb=0.0
memory_change_mb=0.0,
)
assert comparison.has_regressions is True
@@ -632,7 +593,7 @@ class TestBenchmarkModels:
memory=[],
metrics=[],
system_info={},
recommendations=[]
recommendations=[],
)
current = BenchmarkReport(
@@ -644,7 +605,7 @@ class TestBenchmarkModels:
memory=[],
metrics=[],
system_info={},
recommendations=[]
recommendations=[],
)
comparison = ComparisonReport(
@@ -654,7 +615,7 @@ class TestBenchmarkModels:
improvements=[],
regressions=[],
speedup_factor=2.0,
memory_change_mb=0.0
memory_change_mb=0.0,
)
improvement = comparison.overall_improvement
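The speedup_factor values asserted above (0.5 as regression, 2.0 as improvement) read as a baseline/current duration ratio; the arithmetic in plain Python (that reduction to a simple ratio is an assumption about the model's semantics):

baseline_duration = 10.0  # seconds, from the baseline report
current_duration = 5.0    # seconds, from the current report

speedup_factor = baseline_duration / current_duration
print(f"{speedup_factor:.1f}x")  # 2.0x
print("regression" if speedup_factor < 1.0 else "improvement or neutral")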

View File

@@ -60,7 +60,7 @@ class TestChunkingDisabledByDefault:
"""Test that LangChain doesn't chunk by default."""
skill_dir = create_test_skill(tmp_path, large_doc=True)
adaptor = get_adaptor('langchain')
adaptor = get_adaptor("langchain")
package_path = adaptor.package(skill_dir, tmp_path)
with open(package_path) as f:
@@ -71,8 +71,8 @@ class TestChunkingDisabledByDefault:
# No chunking metadata
for doc in data:
assert 'is_chunked' not in doc['metadata']
assert 'chunk_index' not in doc['metadata']
assert "is_chunked" not in doc["metadata"]
assert "chunk_index" not in doc["metadata"]
class TestChunkingEnabled:
@@ -82,12 +82,9 @@ class TestChunkingEnabled:
"""Test that LangChain chunks large documents when enabled."""
skill_dir = create_test_skill(tmp_path, large_doc=True)
adaptor = get_adaptor('langchain')
adaptor = get_adaptor("langchain")
package_path = adaptor.package(
skill_dir,
tmp_path,
enable_chunking=True,
chunk_max_tokens=512
skill_dir, tmp_path, enable_chunking=True, chunk_max_tokens=512
)
with open(package_path) as f:
@@ -97,25 +94,22 @@ class TestChunkingEnabled:
assert len(data) > 2, f"Large doc should be chunked, got {len(data)} docs"
# Check for chunking metadata
chunked_docs = [doc for doc in data if doc['metadata'].get('is_chunked')]
chunked_docs = [doc for doc in data if doc["metadata"].get("is_chunked")]
assert len(chunked_docs) > 0, "Should have chunked documents"
# Verify chunk metadata structure
for doc in chunked_docs:
assert 'chunk_index' in doc['metadata']
assert 'total_chunks' in doc['metadata']
assert 'chunk_id' in doc['metadata']
assert "chunk_index" in doc["metadata"]
assert "total_chunks" in doc["metadata"]
assert "chunk_id" in doc["metadata"]
def test_chunking_preserves_small_docs(self, tmp_path):
"""Test that small documents are not chunked."""
skill_dir = create_test_skill(tmp_path, large_doc=False)
adaptor = get_adaptor('langchain')
adaptor = get_adaptor("langchain")
package_path = adaptor.package(
skill_dir,
tmp_path,
enable_chunking=True,
chunk_max_tokens=512
skill_dir, tmp_path, enable_chunking=True, chunk_max_tokens=512
)
with open(package_path) as f:
@@ -125,7 +119,7 @@ class TestChunkingEnabled:
assert len(data) == 2, "Small docs should not be chunked"
for doc in data:
assert 'is_chunked' not in doc['metadata']
assert "is_chunked" not in doc["metadata"]
class TestCodeBlockPreservation:
@@ -158,43 +152,43 @@ More content after code block.
# Create references dir (required)
(skill_dir / "references").mkdir()
adaptor = get_adaptor('langchain')
adaptor = get_adaptor("langchain")
package_path = adaptor.package(
skill_dir,
tmp_path,
enable_chunking=True,
chunk_max_tokens=200, # Small chunks to force splitting
preserve_code_blocks=True
preserve_code_blocks=True,
)
with open(package_path) as f:
data = json.load(f)
# Find chunks with code block
code_chunks = [
doc for doc in data
if '```python' in doc['page_content']
]
code_chunks = [doc for doc in data if "```python" in doc["page_content"]]
# Code block should be in at least one chunk
assert len(code_chunks) >= 1, "Code block should be preserved"
# Code block should be complete (opening and closing backticks)
for chunk in code_chunks:
content = chunk['page_content']
if '```python' in content:
content = chunk["page_content"]
if "```python" in content:
# Should also have closing backticks
assert content.count('```') >= 2, "Code block should be complete"
assert content.count("```") >= 2, "Code block should be complete"
class TestAutoChunkingForRAGPlatforms:
"""Test that chunking is auto-enabled for RAG platforms."""
@pytest.mark.parametrize("platform", [
'langchain',
# Add others after they're updated:
# 'llama-index', 'haystack', 'weaviate', 'chroma', 'faiss', 'qdrant'
])
@pytest.mark.parametrize(
"platform",
[
"langchain",
# Add others after they're updated:
# 'llama-index', 'haystack', 'weaviate', 'chroma', 'faiss', 'qdrant'
],
)
def test_rag_platforms_auto_chunk(self, platform, tmp_path):
"""Test that RAG platforms auto-enable chunking."""
skill_dir = create_test_skill(tmp_path, large_doc=True)
@@ -208,7 +202,7 @@ class TestAutoChunkingForRAGPlatforms:
open_folder_after=False,
skip_quality_check=True,
target=platform,
enable_chunking=False # Explicitly disabled, but should be auto-enabled
enable_chunking=False, # Explicitly disabled, but should be auto-enabled
)
assert success, f"Packaging failed for {platform}"
@@ -221,8 +215,8 @@ class TestAutoChunkingForRAGPlatforms:
# Should have multiple documents/chunks
if isinstance(data, list):
assert len(data) > 2, f"{platform}: Should auto-chunk large docs"
elif isinstance(data, dict) and 'documents' in data:
assert len(data['documents']) > 2, f"{platform}: Should auto-chunk large docs"
elif isinstance(data, dict) and "documents" in data:
assert len(data["documents"]) > 2, f"{platform}: Should auto-chunk large docs"
class TestBaseAdaptorChunkingHelper:
@@ -237,11 +231,7 @@ class TestBaseAdaptorChunkingHelper:
content = "Test content " * 1000 # Large content
metadata = {"source": "test"}
chunks = adaptor._maybe_chunk_content(
content,
metadata,
enable_chunking=False
)
chunks = adaptor._maybe_chunk_content(content, metadata, enable_chunking=False)
# Should return single chunk
assert len(chunks) == 1
@@ -258,10 +248,7 @@ class TestBaseAdaptorChunkingHelper:
metadata = {"source": "test"}
chunks = adaptor._maybe_chunk_content(
content,
metadata,
enable_chunking=True,
chunk_max_tokens=512
content, metadata, enable_chunking=True, chunk_max_tokens=512
)
# Should return single chunk
@@ -282,7 +269,7 @@ class TestBaseAdaptorChunkingHelper:
enable_chunking=True,
chunk_max_tokens=512,
preserve_code_blocks=True,
source_file="test.md"
source_file="test.md",
)
# Should return multiple chunks
@@ -292,12 +279,12 @@ class TestBaseAdaptorChunkingHelper:
for chunk_text, chunk_meta in chunks:
assert isinstance(chunk_text, str)
assert isinstance(chunk_meta, dict)
assert chunk_meta['is_chunked']
assert 'chunk_index' in chunk_meta
assert 'chunk_id' in chunk_meta
assert chunk_meta["is_chunked"]
assert "chunk_index" in chunk_meta
assert "chunk_id" in chunk_meta
# Original metadata preserved
assert chunk_meta['source'] == 'test'
assert chunk_meta['file'] == 'test.md'
assert chunk_meta["source"] == "test"
assert chunk_meta["file"] == "test.md"
class TestChunkingCLIIntegration:
@@ -313,10 +300,10 @@ class TestChunkingCLIIntegration:
skill_dir=skill_dir,
open_folder_after=False,
skip_quality_check=True,
target='langchain',
target="langchain",
enable_chunking=True, # --chunk flag
chunk_max_tokens=512,
preserve_code_blocks=True
preserve_code_blocks=True,
)
assert success
@@ -339,10 +326,10 @@ class TestChunkingCLIIntegration:
skill_dir=skill_dir,
open_folder_after=False,
skip_quality_check=True,
target='langchain',
target="langchain",
enable_chunking=True,
chunk_max_tokens=256, # Small chunks
preserve_code_blocks=True
preserve_code_blocks=True,
)
assert success
@@ -355,10 +342,10 @@ class TestChunkingCLIIntegration:
skill_dir=skill_dir,
open_folder_after=False,
skip_quality_check=True,
target='langchain',
target="langchain",
enable_chunking=True,
chunk_max_tokens=1024, # Large chunks
preserve_code_blocks=True
preserve_code_blocks=True,
)
assert success
@@ -367,9 +354,10 @@ class TestChunkingCLIIntegration:
data_large = json.load(f)
# Small chunk size should produce more chunks
assert len(data_small) > len(data_large), \
assert len(data_small) > len(data_large), (
f"Small chunks ({len(data_small)}) should be more than large chunks ({len(data_large)})"
)
if __name__ == '__main__':
pytest.main([__file__, '-v'])
if __name__ == "__main__":
pytest.main([__file__, "-v"])
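The assert rewrite above follows a general rule: ruff format drops backslash continuations in favor of parentheses. A minimal runnable sketch of the same transformation, with `data_small`/`data_large` as placeholder lists rather than the test's real JSON payloads:

```python
# Placeholder data standing in for the chunk lists loaded in the test.
data_small, data_large = list(range(8)), list(range(3))

# Before: the message continues across a backslash-escaped newline.
assert len(data_small) > len(data_large), \
    f"Small chunks ({len(data_small)}) should be more than large chunks ({len(data_large)})"

# After: ruff format wraps the message in parentheses instead, which is
# safe to re-wrap and has no trailing-whitespace fragility.
assert len(data_small) > len(data_large), (
    f"Small chunks ({len(data_small)}) should be more than large chunks ({len(data_large)})"
)
```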
View File
@@ -30,12 +30,12 @@ class TestParserRegistry:
"""Test getting list of parser names."""
names = get_parser_names()
assert len(names) == 19
assert 'scrape' in names
assert 'github' in names
assert 'package' in names
assert 'upload' in names
assert 'analyze' in names
assert 'config' in names
assert "scrape" in names
assert "github" in names
assert "package" in names
assert "upload" in names
assert "analyze" in names
assert "config" in names
def test_all_parsers_are_subcommand_parsers(self):
"""Test that all parsers inherit from SubcommandParser."""
@@ -45,9 +45,9 @@ class TestParserRegistry:
def test_all_parsers_have_required_properties(self):
"""Test that all parsers have name, help, description."""
for parser in PARSERS:
assert hasattr(parser, 'name')
assert hasattr(parser, 'help')
assert hasattr(parser, 'description')
assert hasattr(parser, "name")
assert hasattr(parser, "help")
assert hasattr(parser, "description")
assert isinstance(parser.name, str)
assert isinstance(parser.help, str)
assert isinstance(parser.description, str)
@@ -57,7 +57,7 @@ class TestParserRegistry:
def test_all_parsers_have_add_arguments_method(self):
"""Test that all parsers implement add_arguments."""
for parser in PARSERS:
assert hasattr(parser, 'add_arguments')
assert hasattr(parser, "add_arguments")
assert callable(parser.add_arguments)
def test_no_duplicate_parser_names(self):
@@ -106,21 +106,21 @@ class TestParserCreation:
def test_register_parsers_creates_all_subcommands(self):
"""Test that register_parsers creates all 19 subcommands."""
main_parser = argparse.ArgumentParser()
subparsers = main_parser.add_subparsers(dest='command')
subparsers = main_parser.add_subparsers(dest="command")
# Register all parsers
register_parsers(subparsers)
# Test that all commands can be parsed
test_commands = [
'config --show',
'scrape --config test.json',
'github --repo owner/repo',
'package output/test/',
'upload test.zip',
'analyze --directory .',
'enhance output/test/',
'estimate test.json',
"config --show",
"scrape --config test.json",
"github --repo owner/repo",
"package output/test/",
"upload test.zip",
"analyze --directory .",
"enhance output/test/",
"estimate test.json",
]
for cmd in test_commands:
@@ -134,75 +134,76 @@ class TestSpecificParsers:
def test_scrape_parser_arguments(self):
"""Test ScrapeParser has correct arguments."""
main_parser = argparse.ArgumentParser()
subparsers = main_parser.add_subparsers(dest='command')
subparsers = main_parser.add_subparsers(dest="command")
scrape_parser = ScrapeParser()
scrape_parser.create_parser(subparsers)
# Test various argument combinations
args = main_parser.parse_args(['scrape', '--config', 'test.json'])
assert args.command == 'scrape'
assert args.config == 'test.json'
args = main_parser.parse_args(["scrape", "--config", "test.json"])
assert args.command == "scrape"
assert args.config == "test.json"
args = main_parser.parse_args(['scrape', '--config', 'test.json', '--max-pages', '100'])
args = main_parser.parse_args(["scrape", "--config", "test.json", "--max-pages", "100"])
assert args.max_pages == 100
args = main_parser.parse_args(['scrape', '--enhance'])
args = main_parser.parse_args(["scrape", "--enhance"])
assert args.enhance is True
def test_github_parser_arguments(self):
"""Test GitHubParser has correct arguments."""
main_parser = argparse.ArgumentParser()
subparsers = main_parser.add_subparsers(dest='command')
subparsers = main_parser.add_subparsers(dest="command")
github_parser = GitHubParser()
github_parser.create_parser(subparsers)
args = main_parser.parse_args(['github', '--repo', 'owner/repo'])
assert args.command == 'github'
assert args.repo == 'owner/repo'
args = main_parser.parse_args(["github", "--repo", "owner/repo"])
assert args.command == "github"
assert args.repo == "owner/repo"
args = main_parser.parse_args(['github', '--repo', 'owner/repo', '--non-interactive'])
args = main_parser.parse_args(["github", "--repo", "owner/repo", "--non-interactive"])
assert args.non_interactive is True
def test_package_parser_arguments(self):
"""Test PackageParser has correct arguments."""
main_parser = argparse.ArgumentParser()
subparsers = main_parser.add_subparsers(dest='command')
subparsers = main_parser.add_subparsers(dest="command")
package_parser = PackageParser()
package_parser.create_parser(subparsers)
args = main_parser.parse_args(['package', 'output/test/'])
assert args.command == 'package'
assert args.skill_directory == 'output/test/'
args = main_parser.parse_args(["package", "output/test/"])
assert args.command == "package"
assert args.skill_directory == "output/test/"
args = main_parser.parse_args(['package', 'output/test/', '--target', 'gemini'])
assert args.target == 'gemini'
args = main_parser.parse_args(["package", "output/test/", "--target", "gemini"])
assert args.target == "gemini"
args = main_parser.parse_args(['package', 'output/test/', '--no-open'])
args = main_parser.parse_args(["package", "output/test/", "--no-open"])
assert args.no_open is True
def test_analyze_parser_arguments(self):
"""Test AnalyzeParser has correct arguments."""
main_parser = argparse.ArgumentParser()
subparsers = main_parser.add_subparsers(dest='command')
subparsers = main_parser.add_subparsers(dest="command")
from skill_seekers.cli.parsers.analyze_parser import AnalyzeParser
analyze_parser = AnalyzeParser()
analyze_parser.create_parser(subparsers)
args = main_parser.parse_args(['analyze', '--directory', '.'])
assert args.command == 'analyze'
assert args.directory == '.'
args = main_parser.parse_args(["analyze", "--directory", "."])
assert args.command == "analyze"
assert args.directory == "."
args = main_parser.parse_args(['analyze', '--directory', '.', '--quick'])
args = main_parser.parse_args(["analyze", "--directory", ".", "--quick"])
assert args.quick is True
args = main_parser.parse_args(['analyze', '--directory', '.', '--comprehensive'])
args = main_parser.parse_args(["analyze", "--directory", ".", "--comprehensive"])
assert args.comprehensive is True
args = main_parser.parse_args(['analyze', '--directory', '.', '--skip-patterns'])
args = main_parser.parse_args(["analyze", "--directory", ".", "--skip-patterns"])
assert args.skip_patterns is True
@@ -215,11 +216,25 @@ class TestBackwardCompatibility:
# Original commands from old main.py
original_commands = [
'config', 'scrape', 'github', 'pdf', 'unified',
'enhance', 'enhance-status', 'package', 'upload',
'estimate', 'extract-test-examples', 'install-agent',
'analyze', 'install', 'resume', 'stream',
'update', 'multilang', 'quality'
"config",
"scrape",
"github",
"pdf",
"unified",
"enhance",
"enhance-status",
"package",
"upload",
"estimate",
"extract-test-examples",
"install-agent",
"analyze",
"install",
"resume",
"stream",
"update",
"multilang",
"quality",
]
for cmd in original_commands:
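The `original_commands` rewrite shows how ruff format handles collection literals: a literal that overflows the line length is exploded to one element per line, with a trailing comma appended. A small sketch with placeholder values:

```python
# Fits within the line limit -> stays on one line.
short = ["config", "scrape"]

# Too long for one line -> one element per line plus a trailing comma,
# so future additions show up as one-line diffs.
commands = [
    "config",
    "scrape",
    "github",
    "package",
    "upload",
]
```

The trailing comma also works in reverse: once present, it pins the exploded layout even if the list would later fit on one line.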
View File
@@ -20,18 +20,21 @@ from skill_seekers.cli.storage import (
# Check if cloud storage dependencies are available
try:
import boto3 # noqa: F401
BOTO3_AVAILABLE = True
except ImportError:
BOTO3_AVAILABLE = False
try:
from google.cloud import storage # noqa: F401
GCS_AVAILABLE = True
except ImportError:
GCS_AVAILABLE = False
try:
from azure.storage.blob import BlobServiceClient # noqa: F401
AZURE_AVAILABLE = True
except ImportError:
AZURE_AVAILABLE = False
@@ -41,12 +44,13 @@ except ImportError:
# Factory Tests
# ========================================
def test_get_storage_adaptor_s3():
"""Test S3 adaptor factory."""
if not BOTO3_AVAILABLE:
pytest.skip("boto3 not installed")
with patch('skill_seekers.cli.storage.s3_storage.boto3'):
adaptor = get_storage_adaptor('s3', bucket='test-bucket')
with patch("skill_seekers.cli.storage.s3_storage.boto3"):
adaptor = get_storage_adaptor("s3", bucket="test-bucket")
assert isinstance(adaptor, S3StorageAdaptor)
@@ -54,8 +58,8 @@ def test_get_storage_adaptor_gcs():
"""Test GCS adaptor factory."""
if not GCS_AVAILABLE:
pytest.skip("google-cloud-storage not installed")
with patch('skill_seekers.cli.storage.gcs_storage.storage'):
adaptor = get_storage_adaptor('gcs', bucket='test-bucket')
with patch("skill_seekers.cli.storage.gcs_storage.storage"):
adaptor = get_storage_adaptor("gcs", bucket="test-bucket")
assert isinstance(adaptor, GCSStorageAdaptor)
@@ -63,11 +67,11 @@ def test_get_storage_adaptor_azure():
"""Test Azure adaptor factory."""
if not AZURE_AVAILABLE:
pytest.skip("azure-storage-blob not installed")
with patch('skill_seekers.cli.storage.azure_storage.BlobServiceClient'):
with patch("skill_seekers.cli.storage.azure_storage.BlobServiceClient"):
adaptor = get_storage_adaptor(
'azure',
container='test-container',
connection_string='DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key'
"azure",
container="test-container",
connection_string="DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key",
)
assert isinstance(adaptor, AzureStorageAdaptor)
@@ -75,36 +79,37 @@ def test_get_storage_adaptor_azure():
def test_get_storage_adaptor_invalid_provider():
"""Test invalid provider raises error."""
with pytest.raises(ValueError, match="Unsupported storage provider"):
get_storage_adaptor('invalid', bucket='test')
get_storage_adaptor("invalid", bucket="test")
# ========================================
# S3 Storage Tests
# ========================================
def test_s3_upload_file():
"""Test S3 file upload."""
if not BOTO3_AVAILABLE:
pytest.skip("boto3 not installed")
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
# Setup mocks
mock_client = Mock()
mock_boto3.client.return_value = mock_client
mock_boto3.resource.return_value = Mock()
adaptor = S3StorageAdaptor(bucket='test-bucket')
adaptor = S3StorageAdaptor(bucket="test-bucket")
# Create temporary file
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
tmp_file.write(b'test content')
tmp_file.write(b"test content")
tmp_path = tmp_file.name
try:
# Test upload
result = adaptor.upload_file(tmp_path, 'test.txt')
result = adaptor.upload_file(tmp_path, "test.txt")
assert result == 's3://test-bucket/test.txt'
assert result == "s3://test-bucket/test.txt"
mock_client.upload_file.assert_called_once()
finally:
Path(tmp_path).unlink()
@@ -115,23 +120,21 @@ def test_s3_download_file():
if not BOTO3_AVAILABLE:
pytest.skip("boto3 not installed")
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
# Setup mocks
mock_client = Mock()
mock_boto3.client.return_value = mock_client
mock_boto3.resource.return_value = Mock()
adaptor = S3StorageAdaptor(bucket='test-bucket')
adaptor = S3StorageAdaptor(bucket="test-bucket")
with tempfile.TemporaryDirectory() as tmp_dir:
local_path = os.path.join(tmp_dir, 'downloaded.txt')
local_path = os.path.join(tmp_dir, "downloaded.txt")
# Test download
adaptor.download_file('test.txt', local_path)
adaptor.download_file("test.txt", local_path)
mock_client.download_file.assert_called_once_with(
'test-bucket', 'test.txt', local_path
)
mock_client.download_file.assert_called_once_with("test-bucket", "test.txt", local_path)
def test_s3_list_files():
@@ -139,18 +142,18 @@ def test_s3_list_files():
if not BOTO3_AVAILABLE:
pytest.skip("boto3 not installed")
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
# Setup mocks
mock_client = Mock()
mock_paginator = Mock()
mock_page_iterator = [
{
'Contents': [
"Contents": [
{
'Key': 'file1.txt',
'Size': 100,
'LastModified': Mock(isoformat=lambda: '2024-01-01T00:00:00'),
'ETag': '"abc123"'
"Key": "file1.txt",
"Size": 100,
"LastModified": Mock(isoformat=lambda: "2024-01-01T00:00:00"),
"ETag": '"abc123"',
}
]
}
@@ -161,15 +164,15 @@ def test_s3_list_files():
mock_boto3.client.return_value = mock_client
mock_boto3.resource.return_value = Mock()
adaptor = S3StorageAdaptor(bucket='test-bucket')
adaptor = S3StorageAdaptor(bucket="test-bucket")
# Test list
files = adaptor.list_files('prefix/')
files = adaptor.list_files("prefix/")
assert len(files) == 1
assert files[0].key == 'file1.txt'
assert files[0].key == "file1.txt"
assert files[0].size == 100
assert files[0].etag == 'abc123'
assert files[0].etag == "abc123"
def test_s3_file_exists():
@@ -177,17 +180,17 @@ def test_s3_file_exists():
if not BOTO3_AVAILABLE:
pytest.skip("boto3 not installed")
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
# Setup mocks
mock_client = Mock()
mock_client.head_object.return_value = {}
mock_boto3.client.return_value = mock_client
mock_boto3.resource.return_value = Mock()
adaptor = S3StorageAdaptor(bucket='test-bucket')
adaptor = S3StorageAdaptor(bucket="test-bucket")
# Test exists
assert adaptor.file_exists('test.txt') is True
assert adaptor.file_exists("test.txt") is True
def test_s3_get_file_url():
@@ -195,19 +198,19 @@ def test_s3_get_file_url():
if not BOTO3_AVAILABLE:
pytest.skip("boto3 not installed")
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
# Setup mocks
mock_client = Mock()
mock_client.generate_presigned_url.return_value = 'https://s3.amazonaws.com/signed-url'
mock_client.generate_presigned_url.return_value = "https://s3.amazonaws.com/signed-url"
mock_boto3.client.return_value = mock_client
mock_boto3.resource.return_value = Mock()
adaptor = S3StorageAdaptor(bucket='test-bucket')
adaptor = S3StorageAdaptor(bucket="test-bucket")
# Test URL generation
url = adaptor.get_file_url('test.txt', expires_in=7200)
url = adaptor.get_file_url("test.txt", expires_in=7200)
assert url == 'https://s3.amazonaws.com/signed-url'
assert url == "https://s3.amazonaws.com/signed-url"
mock_client.generate_presigned_url.assert_called_once()
@@ -215,12 +218,13 @@ def test_s3_get_file_url():
# GCS Storage Tests
# ========================================
def test_gcs_upload_file():
"""Test GCS file upload."""
if not GCS_AVAILABLE:
pytest.skip("google-cloud-storage not installed")
with patch('skill_seekers.cli.storage.gcs_storage.storage') as mock_storage:
with patch("skill_seekers.cli.storage.gcs_storage.storage") as mock_storage:
# Setup mocks
mock_client = Mock()
mock_bucket = Mock()
@@ -230,18 +234,18 @@ def test_gcs_upload_file():
mock_bucket.blob.return_value = mock_blob
mock_storage.Client.return_value = mock_client
adaptor = GCSStorageAdaptor(bucket='test-bucket')
adaptor = GCSStorageAdaptor(bucket="test-bucket")
# Create temporary file
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
tmp_file.write(b'test content')
tmp_file.write(b"test content")
tmp_path = tmp_file.name
try:
# Test upload
result = adaptor.upload_file(tmp_path, 'test.txt')
result = adaptor.upload_file(tmp_path, "test.txt")
assert result == 'gs://test-bucket/test.txt'
assert result == "gs://test-bucket/test.txt"
mock_blob.upload_from_filename.assert_called_once()
finally:
Path(tmp_path).unlink()
@@ -252,7 +256,7 @@ def test_gcs_download_file():
if not GCS_AVAILABLE:
pytest.skip("google-cloud-storage not installed")
with patch('skill_seekers.cli.storage.gcs_storage.storage') as mock_storage:
with patch("skill_seekers.cli.storage.gcs_storage.storage") as mock_storage:
# Setup mocks
mock_client = Mock()
mock_bucket = Mock()
@@ -262,13 +266,13 @@ def test_gcs_download_file():
mock_bucket.blob.return_value = mock_blob
mock_storage.Client.return_value = mock_client
adaptor = GCSStorageAdaptor(bucket='test-bucket')
adaptor = GCSStorageAdaptor(bucket="test-bucket")
with tempfile.TemporaryDirectory() as tmp_dir:
local_path = os.path.join(tmp_dir, 'downloaded.txt')
local_path = os.path.join(tmp_dir, "downloaded.txt")
# Test download
adaptor.download_file('test.txt', local_path)
adaptor.download_file("test.txt", local_path)
mock_blob.download_to_filename.assert_called_once()
@@ -278,27 +282,27 @@ def test_gcs_list_files():
if not GCS_AVAILABLE:
pytest.skip("google-cloud-storage not installed")
with patch('skill_seekers.cli.storage.gcs_storage.storage') as mock_storage:
with patch("skill_seekers.cli.storage.gcs_storage.storage") as mock_storage:
# Setup mocks
mock_client = Mock()
mock_blob = Mock()
mock_blob.name = 'file1.txt'
mock_blob.name = "file1.txt"
mock_blob.size = 100
mock_blob.updated = Mock(isoformat=lambda: '2024-01-01T00:00:00')
mock_blob.etag = 'abc123'
mock_blob.updated = Mock(isoformat=lambda: "2024-01-01T00:00:00")
mock_blob.etag = "abc123"
mock_blob.metadata = {}
mock_client.list_blobs.return_value = [mock_blob]
mock_storage.Client.return_value = mock_client
mock_client.bucket.return_value = Mock()
adaptor = GCSStorageAdaptor(bucket='test-bucket')
adaptor = GCSStorageAdaptor(bucket="test-bucket")
# Test list
files = adaptor.list_files('prefix/')
files = adaptor.list_files("prefix/")
assert len(files) == 1
assert files[0].key == 'file1.txt'
assert files[0].key == "file1.txt"
assert files[0].size == 100
@@ -306,12 +310,13 @@ def test_gcs_list_files():
# Azure Storage Tests
# ========================================
def test_azure_upload_file():
"""Test Azure file upload."""
if not AZURE_AVAILABLE:
pytest.skip("azure-storage-blob not installed")
with patch('skill_seekers.cli.storage.azure_storage.BlobServiceClient') as mock_blob_service:
with patch("skill_seekers.cli.storage.azure_storage.BlobServiceClient") as mock_blob_service:
# Setup mocks
mock_service_client = Mock()
mock_container_client = Mock()
@@ -321,19 +326,21 @@ def test_azure_upload_file():
mock_container_client.get_blob_client.return_value = mock_blob_client
mock_blob_service.from_connection_string.return_value = mock_service_client
connection_string = 'DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key'
adaptor = AzureStorageAdaptor(container='test-container', connection_string=connection_string)
connection_string = "DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key"
adaptor = AzureStorageAdaptor(
container="test-container", connection_string=connection_string
)
# Create temporary file
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
tmp_file.write(b'test content')
tmp_file.write(b"test content")
tmp_path = tmp_file.name
try:
# Test upload
result = adaptor.upload_file(tmp_path, 'test.txt')
result = adaptor.upload_file(tmp_path, "test.txt")
assert 'test.blob.core.windows.net' in result
assert "test.blob.core.windows.net" in result
mock_blob_client.upload_blob.assert_called_once()
finally:
Path(tmp_path).unlink()
@@ -344,30 +351,32 @@ def test_azure_download_file():
if not AZURE_AVAILABLE:
pytest.skip("azure-storage-blob not installed")
with patch('skill_seekers.cli.storage.azure_storage.BlobServiceClient') as mock_blob_service:
with patch("skill_seekers.cli.storage.azure_storage.BlobServiceClient") as mock_blob_service:
# Setup mocks
mock_service_client = Mock()
mock_container_client = Mock()
mock_blob_client = Mock()
mock_download_stream = Mock()
mock_download_stream.readall.return_value = b'test content'
mock_download_stream.readall.return_value = b"test content"
mock_service_client.get_container_client.return_value = mock_container_client
mock_container_client.get_blob_client.return_value = mock_blob_client
mock_blob_client.download_blob.return_value = mock_download_stream
mock_blob_service.from_connection_string.return_value = mock_service_client
connection_string = 'DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key'
adaptor = AzureStorageAdaptor(container='test-container', connection_string=connection_string)
connection_string = "DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key"
adaptor = AzureStorageAdaptor(
container="test-container", connection_string=connection_string
)
with tempfile.TemporaryDirectory() as tmp_dir:
local_path = os.path.join(tmp_dir, 'downloaded.txt')
local_path = os.path.join(tmp_dir, "downloaded.txt")
# Test download
adaptor.download_file('test.txt', local_path)
adaptor.download_file("test.txt", local_path)
assert Path(local_path).exists()
assert Path(local_path).read_bytes() == b'test content'
assert Path(local_path).read_bytes() == b"test content"
def test_azure_list_files():
@@ -375,29 +384,31 @@ def test_azure_list_files():
if not AZURE_AVAILABLE:
pytest.skip("azure-storage-blob not installed")
with patch('skill_seekers.cli.storage.azure_storage.BlobServiceClient') as mock_blob_service:
with patch("skill_seekers.cli.storage.azure_storage.BlobServiceClient") as mock_blob_service:
# Setup mocks
mock_service_client = Mock()
mock_container_client = Mock()
mock_blob = Mock()
mock_blob.name = 'file1.txt'
mock_blob.name = "file1.txt"
mock_blob.size = 100
mock_blob.last_modified = Mock(isoformat=lambda: '2024-01-01T00:00:00')
mock_blob.etag = 'abc123'
mock_blob.last_modified = Mock(isoformat=lambda: "2024-01-01T00:00:00")
mock_blob.etag = "abc123"
mock_blob.metadata = {}
mock_container_client.list_blobs.return_value = [mock_blob]
mock_service_client.get_container_client.return_value = mock_container_client
mock_blob_service.from_connection_string.return_value = mock_service_client
connection_string = 'DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key'
adaptor = AzureStorageAdaptor(container='test-container', connection_string=connection_string)
connection_string = "DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key"
adaptor = AzureStorageAdaptor(
container="test-container", connection_string=connection_string
)
# Test list
files = adaptor.list_files('prefix/')
files = adaptor.list_files("prefix/")
assert len(files) == 1
assert files[0].key == 'file1.txt'
assert files[0].key == "file1.txt"
assert files[0].size == 100
@@ -405,53 +416,55 @@ def test_azure_list_files():
# Base Adaptor Tests
# ========================================
def test_storage_object():
"""Test StorageObject dataclass."""
obj = StorageObject(
key='test.txt',
key="test.txt",
size=100,
last_modified='2024-01-01T00:00:00',
etag='abc123',
metadata={'key': 'value'}
last_modified="2024-01-01T00:00:00",
etag="abc123",
metadata={"key": "value"},
)
assert obj.key == 'test.txt'
assert obj.key == "test.txt"
assert obj.size == 100
assert obj.metadata == {'key': 'value'}
assert obj.metadata == {"key": "value"}
def test_base_adaptor_abstract():
"""Test that BaseStorageAdaptor cannot be instantiated."""
with pytest.raises(TypeError):
BaseStorageAdaptor(bucket='test')
BaseStorageAdaptor(bucket="test")
# ========================================
# Integration-style Tests
# ========================================
def test_upload_directory():
"""Test directory upload."""
if not BOTO3_AVAILABLE:
pytest.skip("boto3 not installed")
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
# Setup mocks
mock_client = Mock()
mock_boto3.client.return_value = mock_client
mock_boto3.resource.return_value = Mock()
adaptor = S3StorageAdaptor(bucket='test-bucket')
adaptor = S3StorageAdaptor(bucket="test-bucket")
# Create temporary directory with files
with tempfile.TemporaryDirectory() as tmp_dir:
(Path(tmp_dir) / 'file1.txt').write_text('content1')
(Path(tmp_dir) / 'file2.txt').write_text('content2')
(Path(tmp_dir) / 'subdir').mkdir()
(Path(tmp_dir) / 'subdir' / 'file3.txt').write_text('content3')
(Path(tmp_dir) / "file1.txt").write_text("content1")
(Path(tmp_dir) / "file2.txt").write_text("content2")
(Path(tmp_dir) / "subdir").mkdir()
(Path(tmp_dir) / "subdir" / "file3.txt").write_text("content3")
# Test upload directory
uploaded_files = adaptor.upload_directory(tmp_dir, 'skills/')
uploaded_files = adaptor.upload_directory(tmp_dir, "skills/")
assert len(uploaded_files) == 3
assert mock_client.upload_file.call_count == 3
@@ -462,25 +475,25 @@ def test_download_directory():
if not BOTO3_AVAILABLE:
pytest.skip("boto3 not installed")
with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3:
with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3:
# Setup mocks
mock_client = Mock()
mock_paginator = Mock()
mock_page_iterator = [
{
'Contents': [
"Contents": [
{
'Key': 'skills/file1.txt',
'Size': 100,
'LastModified': Mock(isoformat=lambda: '2024-01-01T00:00:00'),
'ETag': '"abc"'
"Key": "skills/file1.txt",
"Size": 100,
"LastModified": Mock(isoformat=lambda: "2024-01-01T00:00:00"),
"ETag": '"abc"',
},
{
'Key': 'skills/file2.txt',
'Size': 200,
'LastModified': Mock(isoformat=lambda: '2024-01-01T00:00:00'),
'ETag': '"def"'
}
"Key": "skills/file2.txt",
"Size": 200,
"LastModified": Mock(isoformat=lambda: "2024-01-01T00:00:00"),
"ETag": '"def"',
},
]
}
]
@@ -490,11 +503,11 @@ def test_download_directory():
mock_boto3.client.return_value = mock_client
mock_boto3.resource.return_value = Mock()
adaptor = S3StorageAdaptor(bucket='test-bucket')
adaptor = S3StorageAdaptor(bucket="test-bucket")
with tempfile.TemporaryDirectory() as tmp_dir:
# Test download directory
downloaded_files = adaptor.download_directory('skills/', tmp_dir)
downloaded_files = adaptor.download_directory("skills/", tmp_dir)
assert len(downloaded_files) == 2
assert mock_client.download_file.call_count == 2
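Most of the churn in this file is quote normalization. A minimal sketch of the rule, with throwaway variable names: ruff format's default style prefers double quotes, flipping single-quoted literals unless the switch would force extra escaping.

```python
before = 'test-bucket'    # written with single quotes
after = "test-bucket"     # ruff format rewrites to double quotes
keeps = 'she said "hi"'   # left alone: double quotes would require escaping
assert before == after
```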
View File
@@ -23,6 +23,7 @@ from skill_seekers.embedding.cache import EmbeddingCache
# Cache Tests
# ========================================
def test_cache_init():
"""Test cache initialization."""
cache = EmbeddingCache(":memory:")
@@ -121,6 +122,7 @@ def test_cache_context_manager():
# Generator Tests
# ========================================
def test_generator_init():
"""Test generator initialization."""
generator = EmbeddingGenerator()
@@ -174,7 +176,7 @@ def test_generator_compute_hash():
assert hash1 != hash4
@patch('skill_seekers.embedding.generator.SENTENCE_TRANSFORMERS_AVAILABLE', False)
@patch("skill_seekers.embedding.generator.SENTENCE_TRANSFORMERS_AVAILABLE", False)
def test_generator_sentence_transformers_not_available():
"""Test sentence-transformers not available."""
generator = EmbeddingGenerator()
@@ -183,7 +185,7 @@ def test_generator_sentence_transformers_not_available():
generator.generate("test", model="all-MiniLM-L6-v2")
@patch('skill_seekers.embedding.generator.OPENAI_AVAILABLE', False)
@patch("skill_seekers.embedding.generator.OPENAI_AVAILABLE", False)
def test_generator_openai_not_available():
"""Test OpenAI not available."""
generator = EmbeddingGenerator()
@@ -192,7 +194,7 @@ def test_generator_openai_not_available():
generator.generate("test", model="text-embedding-3-small")
@patch('skill_seekers.embedding.generator.VOYAGE_AVAILABLE', False)
@patch("skill_seekers.embedding.generator.VOYAGE_AVAILABLE", False)
def test_generator_voyage_not_available():
"""Test Voyage AI not available."""
generator = EmbeddingGenerator()
@@ -227,13 +229,10 @@ def test_generator_voyage_large_2_model_info():
# Model Tests
# ========================================
def test_embedding_request():
"""Test EmbeddingRequest model."""
request = EmbeddingRequest(
text="Hello world",
model="text-embedding-3-small",
normalize=True
)
request = EmbeddingRequest(text="Hello world", model="text-embedding-3-small", normalize=True)
assert request.text == "Hello world"
assert request.model == "text-embedding-3-small"
@@ -243,9 +242,7 @@ def test_embedding_request():
def test_batch_embedding_request():
"""Test BatchEmbeddingRequest model."""
request = BatchEmbeddingRequest(
texts=["text1", "text2", "text3"],
model="text-embedding-3-small",
batch_size=32
texts=["text1", "text2", "text3"], model="text-embedding-3-small", batch_size=32
)
assert len(request.texts) == 3
@@ -255,10 +252,7 @@ def test_batch_embedding_request():
def test_embedding_response():
"""Test EmbeddingResponse model."""
response = EmbeddingResponse(
embedding=[0.1, 0.2, 0.3],
model="test-model",
dimensions=3,
cached=False
embedding=[0.1, 0.2, 0.3], model="test-model", dimensions=3, cached=False
)
assert len(response.embedding) == 3
@@ -273,7 +267,7 @@ def test_batch_embedding_response():
model="test-model",
dimensions=2,
count=2,
cached_count=1
cached_count=1,
)
assert len(response.embeddings) == 2
@@ -288,7 +282,7 @@ def test_health_response():
version="1.0.0",
models=["model1", "model2"],
cache_enabled=True,
cache_size=100
cache_size=100,
)
assert response.status == "ok"
@@ -303,7 +297,7 @@ def test_model_info():
provider="openai",
dimensions=1536,
max_tokens=8191,
cost_per_million=0.02
cost_per_million=0.02,
)
assert info.name == "test-model"
@@ -315,6 +309,7 @@ def test_model_info():
# Integration Tests
# ========================================
def test_cache_batch_operations():
"""Test cache batch operations."""
cache = EmbeddingCache(":memory:")
View File
@@ -23,7 +23,7 @@ from skill_seekers.cli.embedding_pipeline import (
EmbeddingPipeline,
LocalEmbeddingProvider,
EmbeddingCache,
CostTracker
CostTracker,
)
@@ -112,21 +112,16 @@ def test_cost_tracker():
stats = tracker.get_stats()
assert stats['total_requests'] == 2
assert stats['total_tokens'] == 1500
assert stats['cache_hits'] == 1
assert stats['cache_misses'] == 1
assert '50.0%' in stats['cache_rate']
assert stats["total_requests"] == 2
assert stats["total_tokens"] == 1500
assert stats["cache_hits"] == 1
assert stats["cache_misses"] == 1
assert "50.0%" in stats["cache_rate"]
def test_pipeline_initialization():
"""Test pipeline initialization."""
config = EmbeddingConfig(
provider='local',
model='test-model',
dimension=128,
batch_size=10
)
config = EmbeddingConfig(provider="local", model="test-model", dimension=128, batch_size=10)
pipeline = EmbeddingPipeline(config)
@@ -137,12 +132,7 @@ def test_pipeline_initialization():
def test_pipeline_generate_batch():
"""Test batch embedding generation."""
config = EmbeddingConfig(
provider='local',
model='test-model',
dimension=64,
batch_size=2
)
config = EmbeddingConfig(provider="local", model="test-model", dimension=64, batch_size=2)
pipeline = EmbeddingPipeline(config)
@@ -159,11 +149,11 @@ def test_pipeline_caching():
"""Test pipeline uses caching."""
with tempfile.TemporaryDirectory() as tmpdir:
config = EmbeddingConfig(
provider='local',
model='test-model',
provider="local",
model="test-model",
dimension=32,
batch_size=10,
cache_dir=Path(tmpdir)
cache_dir=Path(tmpdir),
)
pipeline = EmbeddingPipeline(config)
@@ -184,10 +174,10 @@ def test_pipeline_caching():
def test_pipeline_batch_processing():
"""Test large batch is processed in chunks."""
config = EmbeddingConfig(
provider='local',
model='test-model',
provider="local",
model="test-model",
dimension=16,
batch_size=3 # Small batch size
batch_size=3, # Small batch size
)
pipeline = EmbeddingPipeline(config)
@@ -201,11 +191,7 @@ def test_pipeline_batch_processing():
def test_validate_dimensions_valid():
"""Test dimension validation with valid embeddings."""
config = EmbeddingConfig(
provider='local',
model='test-model',
dimension=128
)
config = EmbeddingConfig(provider="local", model="test-model", dimension=128)
pipeline = EmbeddingPipeline(config)
@@ -217,11 +203,7 @@ def test_validate_dimensions_valid():
def test_validate_dimensions_invalid():
"""Test dimension validation with invalid embeddings."""
config = EmbeddingConfig(
provider='local',
model='test-model',
dimension=128
)
config = EmbeddingConfig(provider="local", model="test-model", dimension=128)
pipeline = EmbeddingPipeline(config)
@@ -234,30 +216,22 @@ def test_validate_dimensions_invalid():
def test_embedding_result_metadata():
"""Test embedding result includes metadata."""
config = EmbeddingConfig(
provider='local',
model='test-model',
dimension=256
)
config = EmbeddingConfig(provider="local", model="test-model", dimension=256)
pipeline = EmbeddingPipeline(config)
texts = ["test"]
result = pipeline.generate_batch(texts, show_progress=False)
assert 'provider' in result.metadata
assert 'model' in result.metadata
assert 'dimension' in result.metadata
assert result.metadata['dimension'] == 256
assert "provider" in result.metadata
assert "model" in result.metadata
assert "dimension" in result.metadata
assert result.metadata["dimension"] == 256
def test_cost_stats():
"""Test cost statistics tracking."""
config = EmbeddingConfig(
provider='local',
model='test-model',
dimension=64
)
config = EmbeddingConfig(provider="local", model="test-model", dimension=64)
pipeline = EmbeddingPipeline(config)
@@ -266,18 +240,14 @@ def test_cost_stats():
stats = pipeline.get_cost_stats()
assert 'total_requests' in stats
assert 'cache_hits' in stats
assert 'estimated_cost' in stats
assert "total_requests" in stats
assert "cache_hits" in stats
assert "estimated_cost" in stats
def test_empty_batch():
"""Test handling empty batch."""
config = EmbeddingConfig(
provider='local',
model='test-model',
dimension=32
)
config = EmbeddingConfig(provider="local", model="test-model", dimension=32)
pipeline = EmbeddingPipeline(config)
@@ -289,11 +259,7 @@ def test_empty_batch():
def test_single_document():
"""Test single document generation."""
config = EmbeddingConfig(
provider='local',
model='test-model',
dimension=128
)
config = EmbeddingConfig(provider="local", model="test-model", dimension=128)
pipeline = EmbeddingPipeline(config)
@@ -306,11 +272,7 @@ def test_single_document():
def test_different_dimensions():
"""Test different embedding dimensions."""
for dim in [64, 128, 256, 512]:
config = EmbeddingConfig(
provider='local',
model='test-model',
dimension=dim
)
config = EmbeddingConfig(provider="local", model="test-model", dimension=dim)
pipeline = EmbeddingPipeline(config)
result = pipeline.generate_batch(["test"], show_progress=False)
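The repeated `EmbeddingConfig` collapses follow from the line-length rule: a call with no magic trailing comma that fits within the limit is joined onto one line. A self-contained sketch with a stand-in dataclass (the real `EmbeddingConfig` has more fields):

```python
from dataclasses import dataclass


@dataclass
class EmbeddingConfig:  # stand-in for the project's config class
    provider: str
    model: str
    dimension: int


# Before: one keyword per line, no trailing comma, short enough to fit.
config = EmbeddingConfig(
    provider="local",
    model="test-model",
    dimension=128
)

# After: ruff format joins the arguments onto a single line.
config = EmbeddingConfig(provider="local", model="test-model", dimension=128)
```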
View File
@@ -152,9 +152,7 @@ class TestMultiAgentSupport:
def test_rejects_missing_executable(self, tmp_path, monkeypatch):
"""Test rejection when executable is not found on PATH."""
monkeypatch.setattr(
"skill_seekers.cli.enhance_skill_local.shutil.which", lambda _exe: None
)
monkeypatch.setattr("skill_seekers.cli.enhance_skill_local.shutil.which", lambda _exe: None)
skill_dir = _make_skill_dir(tmp_path)
with pytest.raises(ValueError, match="not found in PATH"):
View File
@@ -80,8 +80,9 @@ class TestFrameworkDetection(unittest.TestCase):
arch_data = json.load(f)
self.assertIn("frameworks_detected", arch_data)
self.assertIn("Flask", arch_data["frameworks_detected"],
"Flask should be detected from imports")
self.assertIn(
"Flask", arch_data["frameworks_detected"], "Flask should be detected from imports"
)
def test_files_with_imports_are_included(self):
"""Test that files with only imports are included in analysis (Issue #239)."""
@@ -119,24 +120,19 @@ class TestFrameworkDetection(unittest.TestCase):
analysis_data = json.load(f)
# File should be included
self.assertGreater(len(analysis_data["files"]), 0,
"Files with imports should be included")
self.assertGreater(len(analysis_data["files"]), 0, "Files with imports should be included")
# Find our import-only file
import_file = next(
(f for f in analysis_data["files"] if "imports_only.py" in f["file"]),
None
(f for f in analysis_data["files"] if "imports_only.py" in f["file"]), None
)
self.assertIsNotNone(import_file, "Import-only file should be in analysis")
# Verify imports were extracted
self.assertIn("imports", import_file, "Imports should be extracted")
self.assertGreater(len(import_file["imports"]), 0,
"Should have captured imports")
self.assertIn("django", import_file["imports"],
"Django import should be captured")
self.assertIn("flask", import_file["imports"],
"Flask import should be captured")
self.assertGreater(len(import_file["imports"]), 0, "Should have captured imports")
self.assertIn("django", import_file["imports"], "Django import should be captured")
self.assertIn("flask", import_file["imports"], "Flask import should be captured")
def test_no_false_positive_frameworks(self):
"""Test that framework detection doesn't produce false positives (Issue #239)."""
@@ -145,10 +141,7 @@ class TestFrameworkDetection(unittest.TestCase):
app_dir.mkdir()
# File with no framework imports
(app_dir / "utils.py").write_text(
"def my_function():\n"
" return 'hello'\n"
)
(app_dir / "utils.py").write_text("def my_function():\n return 'hello'\n")
# Run codebase analyzer
from skill_seekers.cli.codebase_scraper import main as scraper_main
@@ -180,12 +173,10 @@ class TestFrameworkDetection(unittest.TestCase):
frameworks = arch_data.get("frameworks_detected", [])
# Should not detect Flask just from "app" directory name
self.assertNotIn("Flask", frameworks,
"Should not detect Flask without imports")
self.assertNotIn("Flask", frameworks, "Should not detect Flask without imports")
# Should not detect other frameworks with "app" in markers
for fw in ["ASP.NET", "Rails", "Laravel"]:
self.assertNotIn(fw, frameworks,
f"Should not detect {fw} without real evidence")
self.assertNotIn(fw, frameworks, f"Should not detect {fw} without real evidence")
if __name__ == "__main__":
View File
@@ -20,9 +20,7 @@ import time
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from skill_seekers.cli.incremental_updater import (
IncrementalUpdater
)
from skill_seekers.cli.incremental_updater import IncrementalUpdater
@pytest.fixture
@@ -281,15 +279,15 @@ def test_apply_update_package(temp_skill_dir):
"timestamp": "2026-02-05T12:00:00",
"skill_name": "test_skill",
"change_summary": {"modified": 1},
"total_changes": 1
"total_changes": 1,
},
"changes": {
"SKILL.md": {
"action": "modify",
"version": 2,
"content": "# Updated Content\n\nApplied from package"
"content": "# Updated Content\n\nApplied from package",
}
}
},
}
package_path.write_text(json.dumps(update_data))
@@ -298,7 +296,9 @@ def test_apply_update_package(temp_skill_dir):
success = updater.apply_update_package(package_path)
assert success
assert (temp_skill_dir / "SKILL.md").read_text() == "# Updated Content\n\nApplied from package"
assert (
temp_skill_dir / "SKILL.md"
).read_text() == "# Updated Content\n\nApplied from package"
def test_content_hash_consistency(temp_skill_dir):
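The `read_text()` assertion above shows how ruff format breaks an overlong comparison: it parenthesizes one operand and splits inside the parentheses rather than reaching for a backslash. A runnable sketch using a temporary directory in place of the pytest `tmp_path` fixture:

```python
import tempfile
from pathlib import Path

tmp_path = Path(tempfile.mkdtemp())
(tmp_path / "SKILL.md").write_text("# Updated Content\n\nApplied from package")

# ruff format wraps the left operand in parentheses and breaks there.
assert (
    tmp_path / "SKILL.md"
).read_text() == "# Updated Content\n\nApplied from package"
```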
View File
@@ -92,7 +92,11 @@ class TestConfigLoading(unittest.TestCase):
{
"type": "documentation",
"base_url": "https://example.com/",
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
"selectors": {
"main_content": "article",
"title": "h1",
"code_blocks": "pre code",
},
"rate_limit": 0.5,
"max_pages": 100,
}
View File
@@ -113,6 +113,7 @@ def check_service_available(url: str, timeout: int = 5) -> bool:
"""Check if a service is available."""
try:
import requests
response = requests.get(url, timeout=timeout)
return response.status_code == 200
except Exception:
@@ -133,7 +134,9 @@ class TestWeaviateIntegration:
# Check if Weaviate is running
if not check_service_available("http://localhost:8080/v1/.well-known/ready"):
pytest.skip("Weaviate not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)")
pytest.skip(
"Weaviate not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)"
)
# Connect to Weaviate
try:
@@ -144,10 +147,7 @@ class TestWeaviateIntegration:
# Package skill
adaptor = get_adaptor("weaviate")
SkillMetadata(
name="integration_test",
description="Integration test skill for Weaviate"
)
SkillMetadata(name="integration_test", description="Integration test skill for Weaviate")
package_path = adaptor.package(sample_skill_dir, tmp_path)
assert package_path.exists(), "Package not created"
@@ -173,19 +173,16 @@ class TestWeaviateIntegration:
with client.batch as batch:
for obj in data["objects"]:
batch.add_data_object(
data_object=obj["properties"],
class_name=class_name,
uuid=obj["id"]
data_object=obj["properties"], class_name=class_name, uuid=obj["id"]
)
# Wait for indexing
time.sleep(1)
# Query - Get all objects
result = client.query.get(
class_name,
["content", "source", "category"]
).with_limit(10).do()
result = (
client.query.get(class_name, ["content", "source", "category"]).with_limit(10).do()
)
# Verify results
assert "data" in result, "Query returned no data"
@@ -203,8 +200,9 @@ class TestWeaviateIntegration:
# Verify content
contents = [obj["content"] for obj in objects]
assert any("vector" in content.lower() for content in contents), \
assert any("vector" in content.lower() for content in contents), (
"Expected content not found"
)
finally:
# Cleanup - Delete collection
@@ -234,7 +232,7 @@ class TestWeaviateIntegration:
description="Test metadata preservation",
version="2.0.0",
author="Integration Test Suite",
tags=["test", "integration", "weaviate"]
tags=["test", "integration", "weaviate"],
)
package_path = adaptor.package(sample_skill_dir, tmp_path)
@@ -249,18 +247,17 @@ class TestWeaviateIntegration:
with client.batch as batch:
for obj in data["objects"]:
batch.add_data_object(
data_object=obj["properties"],
class_name=class_name,
uuid=obj["id"]
data_object=obj["properties"], class_name=class_name, uuid=obj["id"]
)
time.sleep(1)
# Query and verify metadata
result = client.query.get(
class_name,
["source", "version", "author", "tags"]
).with_limit(1).do()
result = (
client.query.get(class_name, ["source", "version", "author", "tags"])
.with_limit(1)
.do()
)
obj = result["data"]["Get"][class_name][0]
assert obj["source"] == "metadata_test", "Source not preserved"
@@ -287,7 +284,9 @@ class TestChromaIntegration:
# Check if Chroma is running
if not check_service_available("http://localhost:8000/api/v1/heartbeat"):
pytest.skip("ChromaDB not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)")
pytest.skip(
"ChromaDB not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)"
)
# Connect to ChromaDB
try:
@@ -299,8 +298,7 @@ class TestChromaIntegration:
# Package skill
adaptor = get_adaptor("chroma")
SkillMetadata(
name="chroma_integration_test",
description="Integration test skill for ChromaDB"
name="chroma_integration_test", description="Integration test skill for ChromaDB"
)
package_path = adaptor.package(sample_skill_dir, tmp_path)
@@ -326,9 +324,7 @@ class TestChromaIntegration:
# Add documents
collection.add(
documents=data["documents"],
metadatas=data["metadatas"],
ids=data["ids"]
documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"]
)
# Wait for indexing
@@ -340,8 +336,7 @@ class TestChromaIntegration:
# Verify results
assert "documents" in results, "Query returned no documents"
assert len(results["documents"]) > 0, "No documents returned"
assert len(results["documents"]) == len(data["documents"]), \
"Document count mismatch"
assert len(results["documents"]) == len(data["documents"]), "Document count mismatch"
# Verify metadata
assert "metadatas" in results, "Query returned no metadatas"
@@ -350,8 +345,9 @@ class TestChromaIntegration:
assert "category" in first_metadata, "Missing category in metadata"
# Verify content
assert any("vector" in doc.lower() for doc in results["documents"]), \
assert any("vector" in doc.lower() for doc in results["documents"]), (
"Expected content not found"
)
finally:
# Cleanup - Delete collection
@@ -377,8 +373,7 @@ class TestChromaIntegration:
# Package and upload
adaptor = get_adaptor("chroma")
metadata = SkillMetadata(
name="chroma_filter_test",
description="Test filtering capabilities"
name="chroma_filter_test", description="Test filtering capabilities"
)
package_path = adaptor.package(sample_skill_dir, tmp_path)
@@ -390,23 +385,18 @@ class TestChromaIntegration:
try:
collection = client.get_or_create_collection(name=collection_name)
collection.add(
documents=data["documents"],
metadatas=data["metadatas"],
ids=data["ids"]
documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"]
)
time.sleep(1)
# Query with category filter
results = collection.get(
where={"category": "getting started"}
)
results = collection.get(where={"category": "getting started"})
# Verify filtering worked
assert len(results["documents"]) > 0, "No documents matched filter"
for metadata in results["metadatas"]:
assert metadata["category"] == "getting started", \
"Filter returned wrong category"
assert metadata["category"] == "getting started", "Filter returned wrong category"
finally:
with contextlib.suppress(Exception):
@@ -428,7 +418,9 @@ class TestQdrantIntegration:
# Check if Qdrant is running
if not check_service_available("http://localhost:6333/"):
pytest.skip("Qdrant not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)")
pytest.skip(
"Qdrant not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)"
)
# Connect to Qdrant
try:
@@ -440,8 +432,7 @@ class TestQdrantIntegration:
# Package skill
adaptor = get_adaptor("qdrant")
SkillMetadata(
name="qdrant_integration_test",
description="Integration test skill for Qdrant"
name="qdrant_integration_test", description="Integration test skill for Qdrant"
)
package_path = adaptor.package(sample_skill_dir, tmp_path)
@@ -465,25 +456,21 @@ class TestQdrantIntegration:
# Create collection
client.create_collection(
collection_name=collection_name,
vectors_config=VectorParams(
size=vector_size,
distance=Distance.COSINE
)
vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
)
# Upload points (with placeholder vectors for testing)
points = []
for point in data["points"]:
points.append(PointStruct(
id=point["id"],
vector=[0.0] * vector_size, # Placeholder vectors
payload=point["payload"]
))
points.append(
PointStruct(
id=point["id"],
vector=[0.0] * vector_size, # Placeholder vectors
payload=point["payload"],
)
)
client.upsert(
collection_name=collection_name,
points=points
)
client.upsert(collection_name=collection_name, points=points)
# Wait for indexing
time.sleep(1)
@@ -493,14 +480,10 @@ class TestQdrantIntegration:
# Verify collection
assert collection_info.points_count > 0, "No points in collection"
assert collection_info.points_count == len(data["points"]), \
"Point count mismatch"
assert collection_info.points_count == len(data["points"]), "Point count mismatch"
# Query - Scroll through points
scroll_result = client.scroll(
collection_name=collection_name,
limit=10
)
scroll_result = client.scroll(collection_name=collection_name, limit=10)
points_list = scroll_result[0]
assert len(points_list) > 0, "No points returned"
@@ -514,8 +497,9 @@ class TestQdrantIntegration:
# Verify content
contents = [p.payload["content"] for p in points_list]
assert any("vector" in content.lower() for content in contents), \
assert any("vector" in content.lower() for content in contents), (
"Expected content not found"
)
finally:
# Cleanup - Delete collection
@@ -527,8 +511,12 @@ class TestQdrantIntegration:
try:
from qdrant_client import QdrantClient
from qdrant_client.models import (
Distance, VectorParams, PointStruct,
Filter, FieldCondition, MatchValue
Distance,
VectorParams,
PointStruct,
Filter,
FieldCondition,
MatchValue,
)
except ImportError:
pytest.skip("qdrant-client not installed")
@@ -544,10 +532,7 @@ class TestQdrantIntegration:
# Package and upload
adaptor = get_adaptor("qdrant")
SkillMetadata(
name="qdrant_filter_test",
description="Test filtering capabilities"
)
SkillMetadata(name="qdrant_filter_test", description="Test filtering capabilities")
package_path = adaptor.package(sample_skill_dir, tmp_path)
with open(package_path) as f:
@@ -560,19 +545,16 @@ class TestQdrantIntegration:
# Create and upload
client.create_collection(
collection_name=collection_name,
vectors_config=VectorParams(
size=vector_size,
distance=Distance.COSINE
)
vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
)
points = []
for point in data["points"]:
points.append(PointStruct(
id=point["id"],
vector=[0.0] * vector_size,
payload=point["payload"]
))
points.append(
PointStruct(
id=point["id"], vector=[0.0] * vector_size, payload=point["payload"]
)
)
client.upsert(collection_name=collection_name, points=points)
time.sleep(1)
@@ -581,14 +563,9 @@ class TestQdrantIntegration:
scroll_result = client.scroll(
collection_name=collection_name,
scroll_filter=Filter(
must=[
FieldCondition(
key="type",
match=MatchValue(value="reference")
)
]
must=[FieldCondition(key="type", match=MatchValue(value="reference"))]
),
limit=10
limit=10,
)
points_list = scroll_result[0]
@@ -596,8 +573,7 @@ class TestQdrantIntegration:
# Verify filtering worked
assert len(points_list) > 0, "No points matched filter"
for point in points_list:
assert point.payload["type"] == "reference", \
"Filter returned wrong type"
assert point.payload["type"] == "reference", "Filter returned wrong type"
finally:
with contextlib.suppress(Exception):
@@ -607,4 +583,5 @@ class TestQdrantIntegration:
if __name__ == "__main__":
# Run integration tests
import sys
sys.exit(pytest.main([__file__, "-v", "-m", "integration"]))
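The Weaviate query rewrites illustrate how ruff format wraps fluent call chains: the whole chain is parenthesized, and when it still overflows, it breaks before each `.` call. A sketch with a stub client standing in for the real Weaviate query builder:

```python
class _Chain:  # hypothetical stub mirroring a fluent query builder
    def get(self, *args):
        return self

    def with_limit(self, n):
        return self

    def do(self):
        return {"data": {}}


class _Client:
    query = _Chain()


client = _Client()

# Before: a single line that overflows the limit in the real file.
result = client.query.get("Collection", ["content", "source"]).with_limit(10).do()

# After: parenthesized, breaking before each dotted call.
result = (
    client.query.get("Collection", ["content", "source"])
    .with_limit(10)
    .do()
)
```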
View File
@@ -192,9 +192,7 @@ https://mikro-orm.io/docs/defining-entities#formulas
# Verify converted URLs are valid
# In real scenario, these would be added to pending_urls and scraped
self.assertTrue(
len(converted_urls) > 0, "Should generate at least one URL to scrape"
)
self.assertTrue(len(converted_urls) > 0, "Should generate at least one URL to scrape")
# Verify no URLs would cause 404 (no anchors in middle of path)
for url in converted_urls:
View File
@@ -464,13 +464,15 @@ class TestValidateConfigTool(unittest.IsolatedAsyncioTestCase):
valid_config = {
"name": "valid-test",
"description": "Test configuration",
"sources": [{
"type": "documentation",
"base_url": "https://example.com/",
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
"rate_limit": 0.5,
"max_pages": 100,
}],
"sources": [
{
"type": "documentation",
"base_url": "https://example.com/",
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"},
"rate_limit": 0.5,
"max_pages": 100,
}
],
}
with open(config_path, "w") as f:
json.dump(valid_config, f)
View File
@@ -19,10 +19,7 @@ import json
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from skill_seekers.cli.multilang_support import (
LanguageDetector,
MultiLanguageManager
)
from skill_seekers.cli.multilang_support import LanguageDetector, MultiLanguageManager
def test_detect_english():
@@ -32,8 +29,8 @@ def test_detect_english():
text = "This is an English document. It contains common English words."
lang_info = detector.detect(text)
assert lang_info.code == 'en'
assert lang_info.name == 'English'
assert lang_info.code == "en"
assert lang_info.name == "English"
assert lang_info.confidence > 0.0
@@ -44,8 +41,8 @@ def test_detect_spanish():
text = "Este es un documento en español. Contiene palabras comunes en español."
lang_info = detector.detect(text)
assert lang_info.code == 'es'
assert lang_info.name == 'Spanish'
assert lang_info.code == "es"
assert lang_info.name == "Spanish"
def test_detect_french():
@@ -55,8 +52,8 @@ def test_detect_french():
text = "Ceci est un document en français. Il contient des mots français communs."
lang_info = detector.detect(text)
assert lang_info.code == 'fr'
assert lang_info.name == 'French'
assert lang_info.code == "fr"
assert lang_info.name == "French"
def test_detect_german():
@@ -66,8 +63,8 @@ def test_detect_german():
text = "Dies ist ein deutsches Dokument. Es enthält übliche deutsche Wörter."
lang_info = detector.detect(text)
assert lang_info.code == 'de'
assert lang_info.name == 'German'
assert lang_info.code == "de"
assert lang_info.name == "German"
def test_detect_chinese():
@@ -77,33 +74,33 @@ def test_detect_chinese():
text = "这是一个中文文档。它包含常见的中文字符。"
lang_info = detector.detect(text)
assert lang_info.code == 'zh'
assert lang_info.name == 'Chinese'
assert lang_info.code == "zh"
assert lang_info.name == "Chinese"
def test_detect_from_filename_dot_pattern():
"""Test language detection from filename (file.en.md pattern)."""
detector = LanguageDetector()
assert detector.detect_from_filename("README.en.md") == 'en'
assert detector.detect_from_filename("guide.es.md") == 'es'
assert detector.detect_from_filename("doc.fr.md") == 'fr'
assert detector.detect_from_filename("README.en.md") == "en"
assert detector.detect_from_filename("guide.es.md") == "es"
assert detector.detect_from_filename("doc.fr.md") == "fr"
def test_detect_from_filename_underscore_pattern():
"""Test language detection from filename (file_en.md pattern)."""
detector = LanguageDetector()
assert detector.detect_from_filename("README_en.md") == 'en'
assert detector.detect_from_filename("guide_es.md") == 'es'
assert detector.detect_from_filename("README_en.md") == "en"
assert detector.detect_from_filename("guide_es.md") == "es"
def test_detect_from_filename_dash_pattern():
"""Test language detection from filename (file-en.md pattern)."""
detector = LanguageDetector()
assert detector.detect_from_filename("README-en.md") == 'en'
assert detector.detect_from_filename("guide-es.md") == 'es'
assert detector.detect_from_filename("README-en.md") == "en"
assert detector.detect_from_filename("guide-es.md") == "es"
def test_detect_from_filename_no_match():
@@ -118,15 +115,11 @@ def test_add_document_single_language():
"""Test adding documents in single language."""
manager = MultiLanguageManager()
manager.add_document(
"README.md",
"This is an English document.",
{"category": "overview"}
)
manager.add_document("README.md", "This is an English document.", {"category": "overview"})
assert len(manager.get_languages()) == 1
assert 'en' in manager.get_languages()
assert manager.get_document_count('en') == 1
assert "en" in manager.get_languages()
assert manager.get_document_count("en") == 1
def test_add_document_multiple_languages():
@@ -138,9 +131,9 @@ def test_add_document_multiple_languages():
manager.add_document("README.fr.md", "Ceci est français.", {})
assert len(manager.get_languages()) == 3
assert 'en' in manager.get_languages()
assert 'es' in manager.get_languages()
assert 'fr' in manager.get_languages()
assert "en" in manager.get_languages()
assert "es" in manager.get_languages()
assert "fr" in manager.get_languages()
def test_force_language():
@@ -148,15 +141,10 @@ def test_force_language():
manager = MultiLanguageManager()
# Force Spanish despite English content
manager.add_document(
"file.md",
"This is actually English content.",
{},
force_language='es'
)
manager.add_document("file.md", "This is actually English content.", {}, force_language="es")
assert 'es' in manager.get_languages()
assert manager.get_document_count('es') == 1
assert "es" in manager.get_languages()
assert manager.get_document_count("es") == 1
def test_filename_language_priority():
@@ -164,14 +152,10 @@ def test_filename_language_priority():
manager = MultiLanguageManager()
# Filename says Spanish, but content is English
manager.add_document(
"guide.es.md",
"This is English content.",
{}
)
manager.add_document("guide.es.md", "This is English content.", {})
# Should use filename language
assert 'es' in manager.get_languages()
assert "es" in manager.get_languages()
def test_document_count_all():
@@ -183,8 +167,8 @@ def test_document_count_all():
manager.add_document("file3.es.md", "Spanish doc", {})
assert manager.get_document_count() == 3
assert manager.get_document_count('en') == 2
assert manager.get_document_count('es') == 1
assert manager.get_document_count("en") == 2
assert manager.get_document_count("es") == 1
def test_primary_language():
@@ -195,7 +179,7 @@ def test_primary_language():
manager.add_document("file2.es.md", "Spanish doc", {})
# Primary should be first added
assert manager.primary_language == 'en'
assert manager.primary_language == "en"
def test_translation_status():
@@ -208,9 +192,9 @@ def test_translation_status():
status = manager.get_translation_status()
assert status.source_language == 'en'
assert 'es' in status.translated_languages
assert 'fr' in status.translated_languages
assert status.source_language == "en"
assert "es" in status.translated_languages
assert "fr" in status.translated_languages
assert len(status.translated_languages) == 2
@@ -225,17 +209,17 @@ def test_export_by_language():
exports = manager.export_by_language(Path(tmpdir))
assert len(exports) == 2
assert 'en' in exports
assert 'es' in exports
assert "en" in exports
assert "es" in exports
# Check files exist
assert exports['en'].exists()
assert exports['es'].exists()
assert exports["en"].exists()
assert exports["es"].exists()
# Check content
en_data = json.loads(exports['en'].read_text())
assert en_data['language'] == 'en'
assert en_data['document_count'] == 1
en_data = json.loads(exports["en"].read_text())
assert en_data["language"] == "en"
assert en_data["document_count"] == 1
def test_translation_report_generation():
@@ -268,11 +252,11 @@ def test_script_detection():
# English uses Latin script
en_info = detector.detect("This is English")
assert en_info.script == 'Latin'
assert en_info.script == "Latin"
# Chinese uses Han script
zh_info = detector.detect("这是中文")
assert zh_info.script == 'Han'
assert zh_info.script == "Han"
def test_confidence_scoring():
@@ -283,7 +267,7 @@ def test_confidence_scoring():
strong_en = "The quick brown fox jumps over the lazy dog. This is clearly English."
lang_info = detector.detect(strong_en)
assert lang_info.code == 'en'
assert lang_info.code == "en"
assert lang_info.confidence > 0.3 # Should have decent confidence
@@ -294,9 +278,9 @@ def test_metadata_preservation():
metadata = {"category": "guide", "version": "1.0"}
manager.add_document("file.md", "English content", metadata)
docs = manager.documents['en']
docs = manager.documents["en"]
assert len(docs) == 1
assert docs[0]['metadata'] == metadata
assert docs[0]["metadata"] == metadata
if __name__ == "__main__":
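
The three filename patterns exercised above (README.en.md, README_en.md, README-en.md) can all be matched with one suffix regex. A minimal sketch, assuming the real LanguageDetector may use a stricter allow-list of language codes; detect_language_suffix is a hypothetical helper, not the actual detect_from_filename:

import re

# Hypothetical helper: accepts ".en", "_en" or "-en" right before the .md extension.
_LANG_SUFFIX = re.compile(r"[._-]([a-z]{2})\.md$", re.IGNORECASE)

def detect_language_suffix(filename: str) -> str | None:
    match = _LANG_SUFFIX.search(filename)
    return match.group(1).lower() if match else None  # None when nothing matches

assert detect_language_suffix("README.en.md") == "en"
assert detect_language_suffix("guide-es.md") == "es"
assert detect_language_suffix("README.md") is None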

View File

@@ -14,9 +14,9 @@ class TestPresetDefinitions:
def test_all_presets_defined(self):
"""Test that all expected presets are defined."""
assert 'quick' in PRESETS
assert 'standard' in PRESETS
assert 'comprehensive' in PRESETS
assert "quick" in PRESETS
assert "standard" in PRESETS
assert "comprehensive" in PRESETS
assert len(PRESETS) == 3
def test_preset_structure(self):
@@ -25,7 +25,7 @@ class TestPresetDefinitions:
assert isinstance(preset, AnalysisPreset)
assert preset.name
assert preset.description
assert preset.depth in ['surface', 'deep', 'full']
assert preset.depth in ["surface", "deep", "full"]
assert isinstance(preset.features, dict)
assert 0 <= preset.enhance_level <= 3
assert preset.estimated_time
@@ -33,45 +33,45 @@ class TestPresetDefinitions:
def test_quick_preset(self):
"""Test quick preset configuration."""
quick = PRESETS['quick']
assert quick.name == 'Quick'
assert quick.depth == 'surface'
quick = PRESETS["quick"]
assert quick.name == "Quick"
assert quick.depth == "surface"
assert quick.enhance_level == 0
assert quick.estimated_time == '1-2 minutes'
assert quick.icon == '⚡'
assert quick.estimated_time == "1-2 minutes"
assert quick.icon == "⚡"
# Quick should disable slow features
assert quick.features['api_reference'] # Essential
assert not quick.features['dependency_graph'] # Slow
assert not quick.features['patterns'] # Slow
assert not quick.features['test_examples'] # Slow
assert not quick.features['how_to_guides'] # Requires AI
assert quick.features['docs'] # Essential
assert quick.features["api_reference"] # Essential
assert not quick.features["dependency_graph"] # Slow
assert not quick.features["patterns"] # Slow
assert not quick.features["test_examples"] # Slow
assert not quick.features["how_to_guides"] # Requires AI
assert quick.features["docs"] # Essential
def test_standard_preset(self):
"""Test standard preset configuration."""
standard = PRESETS['standard']
assert standard.name == 'Standard'
assert standard.depth == 'deep'
standard = PRESETS["standard"]
assert standard.name == "Standard"
assert standard.depth == "deep"
assert standard.enhance_level == 1
assert standard.estimated_time == '5-10 minutes'
assert standard.icon == '🎯'
assert standard.estimated_time == "5-10 minutes"
assert standard.icon == "🎯"
# Standard should enable core features
assert standard.features['api_reference']
assert standard.features['dependency_graph']
assert standard.features['patterns']
assert standard.features['test_examples']
assert not standard.features['how_to_guides'] # Slow
assert standard.features['config_patterns']
assert standard.features['docs']
assert standard.features["api_reference"]
assert standard.features["dependency_graph"]
assert standard.features["patterns"]
assert standard.features["test_examples"]
assert not standard.features["how_to_guides"] # Slow
assert standard.features["config_patterns"]
assert standard.features["docs"]
def test_comprehensive_preset(self):
"""Test comprehensive preset configuration."""
comprehensive = PRESETS['comprehensive']
assert comprehensive.name == 'Comprehensive'
assert comprehensive.depth == 'full'
comprehensive = PRESETS["comprehensive"]
assert comprehensive.name == "Comprehensive"
assert comprehensive.depth == "full"
assert comprehensive.enhance_level == 3
assert comprehensive.estimated_time == '20-60 minutes'
assert comprehensive.icon == '🚀'
assert comprehensive.estimated_time == "20-60 minutes"
assert comprehensive.icon == "🚀"
# Comprehensive should enable ALL features
assert all(comprehensive.features.values())
@@ -81,44 +81,44 @@ class TestPresetManager:
def test_get_preset(self):
"""Test PresetManager.get_preset()."""
quick = PresetManager.get_preset('quick')
quick = PresetManager.get_preset("quick")
assert quick is not None
assert quick.name == 'Quick'
assert quick.depth == 'surface'
assert quick.name == "Quick"
assert quick.depth == "surface"
# Case insensitive
standard = PresetManager.get_preset('STANDARD')
standard = PresetManager.get_preset("STANDARD")
assert standard is not None
assert standard.name == 'Standard'
assert standard.name == "Standard"
def test_get_preset_invalid(self):
"""Test PresetManager.get_preset() with invalid name."""
invalid = PresetManager.get_preset('nonexistent')
invalid = PresetManager.get_preset("nonexistent")
assert invalid is None
def test_list_presets(self):
"""Test PresetManager.list_presets()."""
presets = PresetManager.list_presets()
assert len(presets) == 3
assert 'quick' in presets
assert 'standard' in presets
assert 'comprehensive' in presets
assert "quick" in presets
assert "standard" in presets
assert "comprehensive" in presets
def test_format_preset_help(self):
"""Test PresetManager.format_preset_help()."""
help_text = PresetManager.format_preset_help()
assert 'Available presets:' in help_text
assert '⚡ quick' in help_text
assert '🎯 standard' in help_text
assert '🚀 comprehensive' in help_text
assert '1-2 minutes' in help_text
assert '5-10 minutes' in help_text
assert '20-60 minutes' in help_text
assert "Available presets:" in help_text
assert "⚡ quick" in help_text
assert "🎯 standard" in help_text
assert "🚀 comprehensive" in help_text
assert "1-2 minutes" in help_text
assert "5-10 minutes" in help_text
assert "20-60 minutes" in help_text
def test_get_default_preset(self):
"""Test PresetManager.get_default_preset()."""
default = PresetManager.get_default_preset()
assert default == 'standard'
assert default == "standard"
class TestPresetApplication:
@@ -126,85 +126,85 @@ class TestPresetApplication:
def test_apply_preset_quick(self):
"""Test applying quick preset."""
args = {'directory': '/tmp/test'}
updated = PresetManager.apply_preset('quick', args)
args = {"directory": "/tmp/test"}
updated = PresetManager.apply_preset("quick", args)
assert updated['depth'] == 'surface'
assert updated['enhance_level'] == 0
assert updated['skip_patterns'] # Quick disables patterns
assert updated['skip_dependency_graph'] # Quick disables dep graph
assert updated['skip_test_examples'] # Quick disables tests
assert updated['skip_how_to_guides'] # Quick disables guides
assert not updated['skip_api_reference'] # Quick enables API ref
assert not updated['skip_docs'] # Quick enables docs
assert updated["depth"] == "surface"
assert updated["enhance_level"] == 0
assert updated["skip_patterns"] # Quick disables patterns
assert updated["skip_dependency_graph"] # Quick disables dep graph
assert updated["skip_test_examples"] # Quick disables tests
assert updated["skip_how_to_guides"] # Quick disables guides
assert not updated["skip_api_reference"] # Quick enables API ref
assert not updated["skip_docs"] # Quick enables docs
def test_apply_preset_standard(self):
"""Test applying standard preset."""
args = {'directory': '/tmp/test'}
updated = PresetManager.apply_preset('standard', args)
args = {"directory": "/tmp/test"}
updated = PresetManager.apply_preset("standard", args)
assert updated['depth'] == 'deep'
assert updated['enhance_level'] == 1
assert not updated['skip_patterns'] # Standard enables patterns
assert not updated['skip_dependency_graph'] # Standard enables dep graph
assert not updated['skip_test_examples'] # Standard enables tests
assert updated['skip_how_to_guides'] # Standard disables guides (slow)
assert not updated['skip_api_reference'] # Standard enables API ref
assert not updated['skip_docs'] # Standard enables docs
assert updated["depth"] == "deep"
assert updated["enhance_level"] == 1
assert not updated["skip_patterns"] # Standard enables patterns
assert not updated["skip_dependency_graph"] # Standard enables dep graph
assert not updated["skip_test_examples"] # Standard enables tests
assert updated["skip_how_to_guides"] # Standard disables guides (slow)
assert not updated["skip_api_reference"] # Standard enables API ref
assert not updated["skip_docs"] # Standard enables docs
def test_apply_preset_comprehensive(self):
"""Test applying comprehensive preset."""
args = {'directory': '/tmp/test'}
updated = PresetManager.apply_preset('comprehensive', args)
args = {"directory": "/tmp/test"}
updated = PresetManager.apply_preset("comprehensive", args)
assert updated['depth'] == 'full'
assert updated['enhance_level'] == 3
assert updated["depth"] == "full"
assert updated["enhance_level"] == 3
# Comprehensive enables ALL features
assert not updated['skip_patterns']
assert not updated['skip_dependency_graph']
assert not updated['skip_test_examples']
assert not updated['skip_how_to_guides']
assert not updated['skip_api_reference']
assert not updated['skip_config_patterns']
assert not updated['skip_docs']
assert not updated["skip_patterns"]
assert not updated["skip_dependency_graph"]
assert not updated["skip_test_examples"]
assert not updated["skip_how_to_guides"]
assert not updated["skip_api_reference"]
assert not updated["skip_config_patterns"]
assert not updated["skip_docs"]
def test_cli_overrides_preset(self):
"""Test that CLI args override preset defaults."""
args = {
'directory': '/tmp/test',
'enhance_level': 2, # Override preset default
'skip_patterns': False # Override preset default
"directory": "/tmp/test",
"enhance_level": 2, # Override preset default
"skip_patterns": False, # Override preset default
}
updated = PresetManager.apply_preset('quick', args)
updated = PresetManager.apply_preset("quick", args)
# Preset says enhance_level=0, but CLI said 2
assert updated['enhance_level'] == 2 # CLI wins
assert updated["enhance_level"] == 2 # CLI wins
# Preset says skip_patterns=True (disabled), but CLI said False (enabled)
assert not updated['skip_patterns'] # CLI wins
assert not updated["skip_patterns"] # CLI wins
def test_apply_preset_preserves_args(self):
"""Test that apply_preset preserves existing args."""
args = {
'directory': '/tmp/test',
'output': 'custom_output/',
'languages': 'Python,JavaScript'
"directory": "/tmp/test",
"output": "custom_output/",
"languages": "Python,JavaScript",
}
updated = PresetManager.apply_preset('standard', args)
updated = PresetManager.apply_preset("standard", args)
# Existing args should be preserved
assert updated['directory'] == '/tmp/test'
assert updated['output'] == 'custom_output/'
assert updated['languages'] == 'Python,JavaScript'
assert updated["directory"] == "/tmp/test"
assert updated["output"] == "custom_output/"
assert updated["languages"] == "Python,JavaScript"
def test_apply_preset_invalid(self):
"""Test applying invalid preset raises error."""
args = {'directory': '/tmp/test'}
args = {"directory": "/tmp/test"}
with pytest.raises(ValueError, match="Unknown preset: nonexistent"):
PresetManager.apply_preset('nonexistent', args)
PresetManager.apply_preset("nonexistent", args)
class TestDeprecationWarnings:
@@ -215,12 +215,7 @@ class TestDeprecationWarnings:
from skill_seekers.cli.codebase_scraper import _check_deprecated_flags
import argparse
args = argparse.Namespace(
quick=True,
comprehensive=False,
depth=None,
ai_mode='auto'
)
args = argparse.Namespace(quick=True, comprehensive=False, depth=None, ai_mode="auto")
_check_deprecated_flags(args)
@@ -235,12 +230,7 @@ class TestDeprecationWarnings:
from skill_seekers.cli.codebase_scraper import _check_deprecated_flags
import argparse
args = argparse.Namespace(
quick=False,
comprehensive=True,
depth=None,
ai_mode='auto'
)
args = argparse.Namespace(quick=False, comprehensive=True, depth=None, ai_mode="auto")
_check_deprecated_flags(args)
@@ -255,12 +245,7 @@ class TestDeprecationWarnings:
from skill_seekers.cli.codebase_scraper import _check_deprecated_flags
import argparse
args = argparse.Namespace(
quick=False,
comprehensive=False,
depth='full',
ai_mode='auto'
)
args = argparse.Namespace(quick=False, comprehensive=False, depth="full", ai_mode="auto")
_check_deprecated_flags(args)
@@ -275,12 +260,7 @@ class TestDeprecationWarnings:
from skill_seekers.cli.codebase_scraper import _check_deprecated_flags
import argparse
args = argparse.Namespace(
quick=False,
comprehensive=False,
depth=None,
ai_mode='api'
)
args = argparse.Namespace(quick=False, comprehensive=False, depth=None, ai_mode="api")
_check_deprecated_flags(args)
@@ -295,12 +275,7 @@ class TestDeprecationWarnings:
from skill_seekers.cli.codebase_scraper import _check_deprecated_flags
import argparse
args = argparse.Namespace(
quick=True,
comprehensive=False,
depth='surface',
ai_mode='local'
)
args = argparse.Namespace(quick=True, comprehensive=False, depth="surface", ai_mode="local")
_check_deprecated_flags(args)
@@ -317,12 +292,7 @@ class TestDeprecationWarnings:
from skill_seekers.cli.codebase_scraper import _check_deprecated_flags
import argparse
args = argparse.Namespace(
quick=False,
comprehensive=False,
depth=None,
ai_mode='auto'
)
args = argparse.Namespace(quick=False, comprehensive=False, depth=None, ai_mode="auto")
_check_deprecated_flags(args)
@@ -337,31 +307,31 @@ class TestBackwardCompatibility:
def test_old_flags_still_work(self):
"""Test that old flags still work (with warnings)."""
# --quick flag
args = {'quick': True}
updated = PresetManager.apply_preset('quick', args)
assert updated['depth'] == 'surface'
args = {"quick": True}
updated = PresetManager.apply_preset("quick", args)
assert updated["depth"] == "surface"
# --comprehensive flag
args = {'comprehensive': True}
updated = PresetManager.apply_preset('comprehensive', args)
assert updated['depth'] == 'full'
args = {"comprehensive": True}
updated = PresetManager.apply_preset("comprehensive", args)
assert updated["depth"] == "full"
def test_preset_flag_preferred(self):
"""Test that --preset flag is the recommended way."""
# Using --preset quick
args = {'preset': 'quick'}
updated = PresetManager.apply_preset('quick', args)
assert updated['depth'] == 'surface'
args = {"preset": "quick"}
updated = PresetManager.apply_preset("quick", args)
assert updated["depth"] == "surface"
# Using --preset standard
args = {'preset': 'standard'}
updated = PresetManager.apply_preset('standard', args)
assert updated['depth'] == 'deep'
args = {"preset": "standard"}
updated = PresetManager.apply_preset("standard", args)
assert updated["depth"] == "deep"
# Using --preset comprehensive
args = {'preset': 'comprehensive'}
updated = PresetManager.apply_preset('comprehensive', args)
assert updated['depth'] == 'full'
args = {"preset": "comprehensive"}
updated = PresetManager.apply_preset("comprehensive", args)
assert updated["depth"] == "full"
if __name__ == "__main__":
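
The override semantics in test_cli_overrides_preset boil down to "the preset supplies defaults, explicit args win". A minimal sketch of that precedence rule, assuming the real PresetManager maps each feature flag to an inverted skip_* key; apply_preset_sketch is illustrative only, and PRESETS is the module-level dict the tests import:

def apply_preset_sketch(name: str, args: dict) -> dict:
    # Hypothetical re-implementation for illustration; PRESETS assumed imported
    # from the module under test (its import path is not shown in this diff).
    preset = PRESETS.get(name.lower())
    if preset is None:
        raise ValueError(f"Unknown preset: {name}")
    defaults = {"depth": preset.depth, "enhance_level": preset.enhance_level}
    # features maps name -> enabled; the CLI exposes the inverse as skip_<name>
    defaults.update({f"skip_{key}": not enabled for key, enabled in preset.features.items()})
    defaults.update(args)  # anything the caller passed explicitly wins
    return defaults

updated = apply_preset_sketch("quick", {"enhance_level": 2})
assert updated["enhance_level"] == 2 and updated["depth"] == "surface"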

View File

@@ -19,10 +19,7 @@ import tempfile
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from skill_seekers.cli.quality_metrics import (
QualityAnalyzer,
MetricLevel
)
from skill_seekers.cli.quality_metrics import QualityAnalyzer, MetricLevel
@pytest.fixture
@@ -176,9 +173,9 @@ def test_calculate_statistics(complete_skill_dir):
analyzer = QualityAnalyzer(complete_skill_dir)
stats = analyzer.calculate_statistics()
assert stats['total_files'] > 0
assert stats['markdown_files'] > 0
assert stats['total_words'] > 0
assert stats["total_files"] > 0
assert stats["markdown_files"] > 0
assert stats["total_words"] > 0
def test_overall_score_calculation():
@@ -197,9 +194,7 @@ def test_overall_score_calculation():
coverage = 70.0
health = 85.0
overall = analyzer.calculate_overall_score(
completeness, accuracy, coverage, health
)
overall = analyzer.calculate_overall_score(completeness, accuracy, coverage, health)
assert overall.completeness == 80.0
assert overall.accuracy == 90.0
@@ -218,13 +213,13 @@ def test_grade_assignment():
# Test various scores
score_95 = analyzer.calculate_overall_score(95, 95, 95, 95)
assert score_95.grade == 'A+'
assert score_95.grade == "A+"
score_85 = analyzer.calculate_overall_score(85, 85, 85, 85)
assert score_85.grade in ['A-', 'B+']
assert score_85.grade in ["A-", "B+"]
score_70 = analyzer.calculate_overall_score(70, 70, 70, 70)
assert score_70.grade in ['B-', 'C+', 'C']
assert score_70.grade in ["B-", "C+", "C"]
def test_generate_recommendations():
@@ -240,7 +235,7 @@ def test_generate_recommendations():
recommendations = analyzer.generate_recommendations(score)
assert len(recommendations) > 0
assert any('completeness' in r.lower() for r in recommendations)
assert any("completeness" in r.lower() for r in recommendations)
def test_generate_report(complete_skill_dir):
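
The grade bands that make the assertions above pass can be expressed as a simple cutoff table over an equal-weight average. A self-contained sketch, assuming the real QualityAnalyzer may weight the four dimensions differently:

# Hypothetical scoring sketch: equal-weight average plus letter-grade bands.
def overall(completeness, accuracy, coverage, health):
    return (completeness + accuracy + coverage + health) / 4

def grade(score: float) -> str:
    bands = [(95, "A+"), (90, "A"), (85, "A-"), (80, "B+"),
             (75, "B"), (70, "B-"), (65, "C+"), (60, "C")]
    for cutoff, letter in bands:
        if score >= cutoff:
            return letter
    return "F"

assert grade(overall(95, 95, 95, 95)) == "A+"
assert grade(overall(85, 85, 85, 85)) in ["A-", "B+"]
assert grade(overall(70, 70, 70, 70)) in ["B-", "C+", "C"]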

View File

@@ -28,7 +28,7 @@ class TestRAGChunker:
chunk_overlap=100,
preserve_code_blocks=False,
preserve_paragraphs=False,
min_chunk_size=50
min_chunk_size=50,
)
assert chunker.chunk_size == 1024
@@ -180,13 +180,17 @@ class TestRAGChunker:
# Create SKILL.md
skill_md = skill_dir / "SKILL.md"
skill_md.write_text("# Main Skill\n\nThis is the main skill content.\n\nWith multiple paragraphs.")
skill_md.write_text(
"# Main Skill\n\nThis is the main skill content.\n\nWith multiple paragraphs."
)
# Create references directory with files
references_dir = skill_dir / "references"
references_dir.mkdir()
(references_dir / "getting_started.md").write_text("# Getting Started\n\nQuick start guide.")
(references_dir / "getting_started.md").write_text(
"# Getting Started\n\nQuick start guide."
)
(references_dir / "api.md").write_text("# API Reference\n\nAPI documentation.")
# Chunk skill
@@ -209,7 +213,7 @@ class TestRAGChunker:
{
"chunk_id": "test_0",
"page_content": "Test content",
"metadata": {"source": "test", "chunk_index": 0}
"metadata": {"source": "test", "chunk_index": 0},
}
]
@@ -340,7 +344,7 @@ class TestRAGChunker:
metadata = {
"source": "react-docs",
"category": "hooks",
"url": "https://react.dev/reference/react"
"url": "https://react.dev/reference/react",
}
chunks = chunker.chunk_document(text, metadata)
@@ -379,10 +383,7 @@ class TestRAGChunkerIntegration:
# Convert to LangChain Documents
docs = [
Document(
page_content=chunk["page_content"],
metadata=chunk["metadata"]
)
Document(page_content=chunk["page_content"], metadata=chunk["metadata"])
for chunk in chunks
]
@@ -407,11 +408,7 @@ class TestRAGChunkerIntegration:
# Convert to LlamaIndex TextNodes
nodes = [
TextNode(
text=chunk["page_content"],
metadata=chunk["metadata"],
id_=chunk["chunk_id"]
)
TextNode(text=chunk["page_content"], metadata=chunk["metadata"], id_=chunk["chunk_id"])
for chunk in chunks
]
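
Each chunk the integration tests consume is a plain dict with chunk_id, page_content and metadata keys, which is what makes the LangChain and LlamaIndex conversions above one-liners. A usage sketch under that assumption, with constructor arguments taken from the earlier test (RAGChunker assumed imported from the package under test):

chunker = RAGChunker(chunk_size=1024, chunk_overlap=100, min_chunk_size=50)
chunks = chunker.chunk_document(
    "Long markdown text to split...",
    {"source": "react-docs", "category": "hooks"},
)
for chunk in chunks:
    # Framework-agnostic dicts: adapt to whichever vector store you use.
    print(chunk["chunk_id"], len(chunk["page_content"]), chunk["metadata"]["source"])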

View File

@@ -13,6 +13,7 @@ pytest.importorskip("mcp.server")
# Check if starlette is available
try:
from starlette.testclient import TestClient
STARLETTE_AVAILABLE = True
except ImportError:
STARLETTE_AVAILABLE = False
@@ -21,8 +22,7 @@ from skill_seekers.mcp.server_fastmcp import mcp
# Skip all tests if starlette is not installed
pytestmark = pytest.mark.skipif(
not STARLETTE_AVAILABLE,
reason="starlette not installed (pip install starlette httpx)"
not STARLETTE_AVAILABLE, reason="starlette not installed (pip install starlette httpx)"
)

Some files were not shown because too many files have changed in this diff.