style: Format all Python files with ruff

- Formatted 103 files to comply with ruff format requirements
- No code logic changes; formatting only (quote style, trailing commas, line wrapping, whitespace)
- Fixes CI formatting check failures
This commit is contained in:
yusyus
2026-02-08 14:42:27 +03:00
parent 6e4f623b9d
commit 0265de5816
103 changed files with 2241 additions and 2627 deletions

View File

@@ -33,9 +33,9 @@ from .runner import BenchmarkRunner
from .models import BenchmarkReport, Metric
__all__ = [
'Benchmark',
'BenchmarkResult',
'BenchmarkRunner',
'BenchmarkReport',
'Metric',
"Benchmark",
"BenchmarkResult",
"BenchmarkRunner",
"BenchmarkReport",
"Metric",
]

View File

@@ -11,12 +11,7 @@ from typing import Any
from collections.abc import Callable
from pathlib import Path
from .models import (
Metric,
TimingResult,
MemoryUsage,
BenchmarkReport
)
from .models import Metric, TimingResult, MemoryUsage, BenchmarkReport
class BenchmarkResult:
@@ -97,7 +92,7 @@ class BenchmarkResult:
memory=self.memory,
metrics=self.metrics,
system_info=self.system_info,
recommendations=self.recommendations
recommendations=self.recommendations,
)
@@ -161,7 +156,7 @@ class Benchmark:
operation=operation,
duration=duration,
iterations=iterations,
avg_duration=duration / iterations if iterations > 1 else duration
avg_duration=duration / iterations if iterations > 1 else duration,
)
self.result.add_timing(timing)
@@ -201,7 +196,7 @@ class Benchmark:
before_mb=mem_before,
after_mb=mem_after,
peak_mb=peak_memory,
allocated_mb=mem_after - mem_before
allocated_mb=mem_after - mem_before,
)
self.result.add_memory(usage)
@@ -212,7 +207,7 @@ class Benchmark:
*args,
operation: str | None = None,
track_memory: bool = False,
**kwargs
**kwargs,
) -> Any:
"""
Measure function execution.
@@ -260,17 +255,16 @@ class Benchmark:
def load_config(path):
return json.load(open(path))
"""
def decorator(func: Callable) -> Callable:
@functools.wraps(func)
def wrapper(*args, **kwargs):
return self.measure(
func,
*args,
operation=operation,
track_memory=track_memory,
**kwargs
func, *args, operation=operation, track_memory=track_memory, **kwargs
)
return wrapper
return decorator
def metric(self, name: str, value: float, unit: str):
@@ -285,11 +279,7 @@ class Benchmark:
Examples:
benchmark.metric("pages_per_sec", 12.5, "pages/sec")
"""
metric = Metric(
name=name,
value=value,
unit=unit
)
metric = Metric(name=name, value=value, unit=unit)
self.result.add_metric(metric)
def recommend(self, text: str):
@@ -328,7 +318,7 @@ class Benchmark:
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, 'w') as f:
with open(path, "w") as f:
f.write(report.model_dump_json(indent=2))
def analyze(self):
@@ -339,11 +329,7 @@ class Benchmark:
"""
# Analyze timing bottlenecks
if self.result.timings:
sorted_timings = sorted(
self.result.timings,
key=lambda t: t.duration,
reverse=True
)
sorted_timings = sorted(self.result.timings, key=lambda t: t.duration, reverse=True)
slowest = sorted_timings[0]
total_time = sum(t.duration for t in self.result.timings)
@@ -351,7 +337,7 @@ class Benchmark:
if slowest.duration > total_time * 0.5:
self.recommend(
f"Bottleneck: '{slowest.operation}' takes "
f"{slowest.duration:.1f}s ({slowest.duration/total_time*100:.0f}% of total)"
f"{slowest.duration:.1f}s ({slowest.duration / total_time * 100:.0f}% of total)"
)
# Analyze memory usage
@@ -360,8 +346,7 @@ class Benchmark:
if peak > 1000: # >1GB
self.recommend(
f"High memory usage: {peak:.0f}MB peak. "
"Consider processing in batches."
f"High memory usage: {peak:.0f}MB peak. Consider processing in batches."
)
# Check for memory leaks

View File

@@ -14,8 +14,7 @@ class Metric(BaseModel):
value: float = Field(..., description="Metric value")
unit: str = Field(..., description="Unit (seconds, bytes, pages/sec, etc.)")
timestamp: datetime = Field(
default_factory=datetime.utcnow,
description="When metric was recorded"
default_factory=datetime.utcnow, description="When metric was recorded"
)
@@ -48,26 +47,13 @@ class BenchmarkReport(BaseModel):
finished_at: datetime = Field(..., description="Finish time")
total_duration: float = Field(..., description="Total duration in seconds")
timings: list[TimingResult] = Field(
default_factory=list,
description="Timing results"
)
memory: list[MemoryUsage] = Field(
default_factory=list,
description="Memory usage results"
)
metrics: list[Metric] = Field(
default_factory=list,
description="Additional metrics"
)
timings: list[TimingResult] = Field(default_factory=list, description="Timing results")
memory: list[MemoryUsage] = Field(default_factory=list, description="Memory usage results")
metrics: list[Metric] = Field(default_factory=list, description="Additional metrics")
system_info: dict[str, Any] = Field(
default_factory=dict,
description="System information"
)
system_info: dict[str, Any] = Field(default_factory=dict, description="System information")
recommendations: list[str] = Field(
default_factory=list,
description="Optimization recommendations"
default_factory=list, description="Optimization recommendations"
)
@property
@@ -89,14 +75,8 @@ class ComparisonReport(BaseModel):
baseline: BenchmarkReport = Field(..., description="Baseline benchmark")
current: BenchmarkReport = Field(..., description="Current benchmark")
improvements: list[str] = Field(
default_factory=list,
description="Performance improvements"
)
regressions: list[str] = Field(
default_factory=list,
description="Performance regressions"
)
improvements: list[str] = Field(default_factory=list, description="Performance improvements")
regressions: list[str] = Field(default_factory=list, description="Performance regressions")
speedup_factor: float = Field(..., description="Overall speedup factor")
memory_change_mb: float = Field(..., description="Memory usage change (MB)")

View File

@@ -46,10 +46,7 @@ class BenchmarkRunner:
self.output_dir.mkdir(parents=True, exist_ok=True)
def run(
self,
name: str,
benchmark_func: Callable[[Benchmark], None],
save: bool = True
self, name: str, benchmark_func: Callable[[Benchmark], None], save: bool = True
) -> BenchmarkReport:
"""
Run single benchmark.
@@ -83,7 +80,7 @@ class BenchmarkRunner:
filename = f"{name}_{timestamp}.json"
path = self.output_dir / filename
with open(path, 'w') as f:
with open(path, "w") as f:
f.write(report.model_dump_json(indent=2))
print(f"📊 Saved benchmark: {path}")
@@ -91,9 +88,7 @@ class BenchmarkRunner:
return report
def run_suite(
self,
benchmarks: dict[str, Callable[[Benchmark], None]],
save: bool = True
self, benchmarks: dict[str, Callable[[Benchmark], None]], save: bool = True
) -> dict[str, BenchmarkReport]:
"""
Run multiple benchmarks.
@@ -122,11 +117,7 @@ class BenchmarkRunner:
return reports
def compare(
self,
baseline_path: Path,
current_path: Path
) -> ComparisonReport:
def compare(self, baseline_path: Path, current_path: Path) -> ComparisonReport:
"""
Compare two benchmark reports.
@@ -215,7 +206,7 @@ class BenchmarkRunner:
improvements=improvements,
regressions=regressions,
speedup_factor=speedup_factor,
memory_change_mb=memory_change_mb
memory_change_mb=memory_change_mb,
)
def list_benchmarks(self) -> list[dict[str, Any]]:
@@ -237,13 +228,15 @@ class BenchmarkRunner:
with open(path) as f:
data = json.load(f)
benchmarks.append({
"name": data["name"],
"path": str(path),
"started_at": data["started_at"],
"duration": data["total_duration"],
"operations": len(data.get("timings", []))
})
benchmarks.append(
{
"name": data["name"],
"path": str(path),
"started_at": data["started_at"],
"duration": data["total_duration"],
"operations": len(data.get("timings", [])),
}
)
except Exception:
# Skip invalid files
continue

View File

@@ -74,7 +74,7 @@ class SkillAdaptor(ABC):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill for platform (ZIP, tar.gz, etc.).
@@ -282,7 +282,7 @@ class SkillAdaptor(ABC):
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True,
source_file: str = None
source_file: str = None,
) -> list[tuple[str, dict]]:
"""
Optionally chunk content for RAG platforms.
@@ -326,33 +326,31 @@ class SkillAdaptor(ABC):
chunk_overlap=max(50, chunk_max_tokens // 10), # 10% overlap
preserve_code_blocks=preserve_code_blocks,
preserve_paragraphs=True,
min_chunk_size=100 # 100 tokens minimum
min_chunk_size=100, # 100 tokens minimum
)
# Chunk the document
chunks = chunker.chunk_document(
text=content,
metadata=metadata,
source_file=source_file or metadata.get('file', 'unknown')
source_file=source_file or metadata.get("file", "unknown"),
)
# Convert RAGChunker output format to (text, metadata) tuples
result = []
for chunk_dict in chunks:
chunk_text = chunk_dict['page_content']
chunk_text = chunk_dict["page_content"]
chunk_meta = {
**metadata, # Base metadata
**chunk_dict['metadata'], # RAGChunker metadata (chunk_index, etc.)
'is_chunked': True,
'chunk_id': chunk_dict['chunk_id']
**chunk_dict["metadata"], # RAGChunker metadata (chunk_index, etc.)
"is_chunked": True,
"chunk_id": chunk_dict["chunk_id"],
}
result.append((chunk_text, chunk_meta))
return result
def _format_output_path(
self, skill_dir: Path, output_path: Path, suffix: str
) -> Path:
def _format_output_path(self, skill_dir: Path, output_path: Path, suffix: str) -> Path:
"""
Generate standardized output path with intelligent format handling.
@@ -379,11 +377,13 @@ class SkillAdaptor(ABC):
output_str = str(output_path)
# Extract the file extension from suffix (e.g., ".json" from "-langchain.json")
correct_ext = suffix.split('.')[-1] if '.' in suffix else ''
correct_ext = suffix.split(".")[-1] if "." in suffix else ""
if correct_ext and not output_str.endswith(f".{correct_ext}"):
# Replace common incorrect extensions
output_str = output_str.replace(".zip", f".{correct_ext}").replace(".tar.gz", f".{correct_ext}")
output_str = output_str.replace(".zip", f".{correct_ext}").replace(
".tar.gz", f".{correct_ext}"
)
# Ensure platform suffix is present
if not output_str.endswith(suffix):
@@ -395,9 +395,7 @@ class SkillAdaptor(ABC):
return Path(output_str)
def _generate_deterministic_id(
self, content: str, metadata: dict, format: str = "hex"
) -> str:
def _generate_deterministic_id(self, content: str, metadata: dict, format: str = "hex") -> str:
"""
Generate deterministic ID from content and metadata.

View File

@@ -43,11 +43,7 @@ class ChromaAdaptor(SkillAdaptor):
return self._generate_deterministic_id(content, metadata, format="hex")
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as JSON for Chroma ingestion.
@@ -90,9 +86,9 @@ class ChromaAdaptor(SkillAdaptor):
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks to parallel arrays
@@ -120,9 +116,9 @@ class ChromaAdaptor(SkillAdaptor):
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks to parallel arrays
@@ -149,7 +145,7 @@ class ChromaAdaptor(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for Chroma.
@@ -183,7 +179,7 @@ class ChromaAdaptor(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file
@@ -233,7 +229,7 @@ class ChromaAdaptor(SkillAdaptor):
except ImportError:
return {
"success": False,
"message": "chromadb not installed. Run: pip install chromadb"
"message": "chromadb not installed. Run: pip install chromadb",
}
# Load package
@@ -241,8 +237,8 @@ class ChromaAdaptor(SkillAdaptor):
data = json.load(f)
# Determine client type and configuration
persist_directory = kwargs.get('persist_directory')
chroma_url = kwargs.get('chroma_url')
persist_directory = kwargs.get("persist_directory")
chroma_url = kwargs.get("chroma_url")
try:
if persist_directory:
@@ -253,15 +249,15 @@ class ChromaAdaptor(SkillAdaptor):
# Remote HTTP client
print(f"🌐 Connecting to ChromaDB at: {chroma_url}")
# Parse URL
if '://' in chroma_url:
parts = chroma_url.split('://')
if "://" in chroma_url:
parts = chroma_url.split("://")
parts[0]
host_port = parts[1]
else:
host_port = chroma_url
if ':' in host_port:
host, port = host_port.rsplit(':', 1)
if ":" in host_port:
host, port = host_port.rsplit(":", 1)
port = int(port)
else:
host = host_port
@@ -276,12 +272,12 @@ class ChromaAdaptor(SkillAdaptor):
except Exception as e:
return {
"success": False,
"message": f"Failed to connect to ChromaDB: {e}\n\nTry:\n pip install chromadb\n chroma run # Start local server"
"message": f"Failed to connect to ChromaDB: {e}\n\nTry:\n pip install chromadb\n chroma run # Start local server",
}
# Get or create collection
collection_name = kwargs.get('collection_name', data.get('collection_name', 'skill_docs'))
distance_function = kwargs.get('distance_function', 'cosine')
collection_name = kwargs.get("collection_name", data.get("collection_name", "skill_docs"))
distance_function = kwargs.get("distance_function", "cosine")
try:
# Try to get existing collection
@@ -291,62 +287,57 @@ class ChromaAdaptor(SkillAdaptor):
try:
# Create new collection
metadata = {"hnsw:space": distance_function}
collection = client.create_collection(
name=collection_name,
metadata=metadata
)
collection = client.create_collection(name=collection_name, metadata=metadata)
print(f"✅ Created collection: {collection_name} (distance: {distance_function})")
except Exception as e:
return {
"success": False,
"message": f"Failed to create collection '{collection_name}': {e}"
"message": f"Failed to create collection '{collection_name}': {e}",
}
# Handle embeddings
embedding_function = kwargs.get('embedding_function')
embedding_function = kwargs.get("embedding_function")
try:
if embedding_function == 'openai':
if embedding_function == "openai":
# Generate embeddings with OpenAI
print("🔄 Generating OpenAI embeddings...")
embeddings = self._generate_openai_embeddings(
data['documents'],
api_key=kwargs.get('openai_api_key')
data["documents"], api_key=kwargs.get("openai_api_key")
)
collection.add(
documents=data['documents'],
metadatas=data['metadatas'],
ids=data['ids'],
embeddings=embeddings
documents=data["documents"],
metadatas=data["metadatas"],
ids=data["ids"],
embeddings=embeddings,
)
elif embedding_function == 'sentence-transformers':
elif embedding_function == "sentence-transformers":
# Use sentence-transformers
print("🔄 Generating sentence-transformer embeddings...")
try:
from chromadb.utils import embedding_functions
ef = embedding_functions.SentenceTransformerEmbeddingFunction()
embeddings = [ef([doc])[0] for doc in data['documents']]
embeddings = [ef([doc])[0] for doc in data["documents"]]
collection.add(
documents=data['documents'],
metadatas=data['metadatas'],
ids=data['ids'],
embeddings=embeddings
documents=data["documents"],
metadatas=data["metadatas"],
ids=data["ids"],
embeddings=embeddings,
)
except ImportError:
return {
"success": False,
"message": "sentence-transformers not installed. Run: pip install sentence-transformers"
"message": "sentence-transformers not installed. Run: pip install sentence-transformers",
}
else:
# No embeddings - Chroma will auto-generate
print("🔄 Using Chroma's default embedding function...")
collection.add(
documents=data['documents'],
metadatas=data['metadatas'],
ids=data['ids']
documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"]
)
count = len(data['documents'])
count = len(data["documents"])
print(f"✅ Uploaded {count} documents to ChromaDB")
print(f"📊 Collection '{collection_name}' now has {collection.count()} total documents")
@@ -355,19 +346,14 @@ class ChromaAdaptor(SkillAdaptor):
"message": f"Uploaded {count} documents to ChromaDB collection '{collection_name}'",
"collection": collection_name,
"count": count,
"url": f"{chroma_url}/collections/{collection_name}" if chroma_url else None
"url": f"{chroma_url}/collections/{collection_name}" if chroma_url else None,
}
except Exception as e:
return {
"success": False,
"message": f"Upload failed: {e}"
}
return {"success": False, "message": f"Upload failed: {e}"}
def _generate_openai_embeddings(
self,
documents: list[str],
api_key: str = None
self, documents: list[str], api_key: str = None
) -> list[list[float]]:
"""
Generate embeddings using OpenAI API.
@@ -380,12 +366,13 @@ class ChromaAdaptor(SkillAdaptor):
List of embedding vectors
"""
import os
try:
from openai import OpenAI
except ImportError:
raise ImportError("openai not installed. Run: pip install openai") from None
api_key = api_key or os.getenv('OPENAI_API_KEY')
api_key = api_key or os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
@@ -398,14 +385,14 @@ class ChromaAdaptor(SkillAdaptor):
print(f" Generating embeddings for {len(documents)} documents...")
for i in range(0, len(documents), batch_size):
batch = documents[i:i+batch_size]
batch = documents[i : i + batch_size]
try:
response = client.embeddings.create(
input=batch,
model="text-embedding-3-small" # Cheapest, fastest
model="text-embedding-3-small", # Cheapest, fastest
)
embeddings.extend([item.embedding for item in response.data])
print(f" ✓ Processed {min(i+batch_size, len(documents))}/{len(documents)}")
print(f" ✓ Processed {min(i + batch_size, len(documents))}/{len(documents)}")
except Exception as e:
raise Exception(f"OpenAI embedding generation failed: {e}") from e

View File

@@ -81,7 +81,14 @@ version: {metadata.version}
{content_body}
"""
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
def package(
self,
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into ZIP file for Claude.

View File

@@ -46,11 +46,7 @@ class FAISSHelpers(SkillAdaptor):
return self._generate_deterministic_id(content, metadata, format="hex")
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as JSON for FAISS ingestion.
@@ -92,9 +88,9 @@ class FAISSHelpers(SkillAdaptor):
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks to parallel arrays
@@ -121,9 +117,9 @@ class FAISSHelpers(SkillAdaptor):
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks to parallel arrays
@@ -160,7 +156,7 @@ class FAISSHelpers(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for FAISS.
@@ -193,7 +189,7 @@ class FAISSHelpers(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file

View File

@@ -86,7 +86,14 @@ See the references directory for complete documentation with examples and best p
# Return plain markdown (NO frontmatter)
return content_body
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
def package(
self,
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into tar.gz file for Gemini.

View File

@@ -29,11 +29,7 @@ class HaystackAdaptor(SkillAdaptor):
DEFAULT_API_ENDPOINT = None # No upload endpoint
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as JSON array of Haystack Documents.
@@ -73,17 +69,19 @@ class HaystackAdaptor(SkillAdaptor):
content,
doc_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks as documents
for chunk_text, chunk_meta in chunks:
documents.append({
"content": chunk_text,
"meta": chunk_meta,
})
documents.append(
{
"content": chunk_text,
"meta": chunk_meta,
}
)
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -104,17 +102,19 @@ class HaystackAdaptor(SkillAdaptor):
ref_content,
doc_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks as documents
for chunk_text, chunk_meta in chunks:
documents.append({
"content": chunk_text,
"meta": chunk_meta,
})
documents.append(
{
"content": chunk_text,
"meta": chunk_meta,
}
)
# Return as formatted JSON
return json.dumps(documents, indent=2, ensure_ascii=False)
@@ -125,7 +125,7 @@ class HaystackAdaptor(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for Haystack.
@@ -159,7 +159,7 @@ class HaystackAdaptor(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file

View File

@@ -29,11 +29,7 @@ class LangChainAdaptor(SkillAdaptor):
DEFAULT_API_ENDPOINT = None # No upload endpoint
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as JSON array of LangChain Documents.
@@ -73,17 +69,14 @@ class LangChainAdaptor(SkillAdaptor):
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks to documents
for chunk_text, chunk_meta in chunks:
documents.append({
"page_content": chunk_text,
"metadata": chunk_meta
})
documents.append({"page_content": chunk_text, "metadata": chunk_meta})
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -104,17 +97,14 @@ class LangChainAdaptor(SkillAdaptor):
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks to documents
for chunk_text, chunk_meta in chunks:
documents.append({
"page_content": chunk_text,
"metadata": chunk_meta
})
documents.append({"page_content": chunk_text, "metadata": chunk_meta})
# Return as formatted JSON
return json.dumps(documents, indent=2, ensure_ascii=False)
@@ -125,7 +115,7 @@ class LangChainAdaptor(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for LangChain.
@@ -162,7 +152,7 @@ class LangChainAdaptor(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file

View File

@@ -42,11 +42,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
return self._generate_deterministic_id(content, metadata, format="hex")
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as JSON array of LlamaIndex Nodes.
@@ -88,19 +84,21 @@ class LlamaIndexAdaptor(SkillAdaptor):
content,
node_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks as nodes
for chunk_text, chunk_meta in chunks:
nodes.append({
"text": chunk_text,
"metadata": chunk_meta,
"id_": self._generate_node_id(chunk_text, chunk_meta),
"embedding": None,
})
nodes.append(
{
"text": chunk_text,
"metadata": chunk_meta,
"id_": self._generate_node_id(chunk_text, chunk_meta),
"embedding": None,
}
)
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -121,19 +119,21 @@ class LlamaIndexAdaptor(SkillAdaptor):
ref_content,
node_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks as nodes
for chunk_text, chunk_meta in chunks:
nodes.append({
"text": chunk_text,
"metadata": chunk_meta,
"id_": self._generate_node_id(chunk_text, chunk_meta),
"embedding": None,
})
nodes.append(
{
"text": chunk_text,
"metadata": chunk_meta,
"id_": self._generate_node_id(chunk_text, chunk_meta),
"embedding": None,
}
)
# Return as formatted JSON
return json.dumps(nodes, indent=2, ensure_ascii=False)
@@ -144,7 +144,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for LlamaIndex.
@@ -178,7 +178,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file

View File

@@ -81,7 +81,14 @@ Browse the reference files for detailed information on each topic. All files are
# Return pure markdown (no frontmatter, no special formatting)
return content_body
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
def package(
self,
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into ZIP file with markdown documentation.

View File

@@ -103,7 +103,14 @@ Always prioritize accuracy by consulting the attached documentation files before
# Return plain text instructions (NO frontmatter)
return content_body
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
def package(
self,
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into ZIP file for OpenAI Assistants.

View File

@@ -44,11 +44,7 @@ class QdrantAdaptor(SkillAdaptor):
return self._generate_deterministic_id(content, metadata, format="uuid5")
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as Qdrant collection JSON.
@@ -87,30 +83,35 @@ class QdrantAdaptor(SkillAdaptor):
content,
payload_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks as points
for chunk_text, chunk_meta in chunks:
point_id = self._generate_point_id(chunk_text, {
"source": chunk_meta.get("source", metadata.name),
"file": chunk_meta.get("file", "SKILL.md")
})
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": chunk_text,
point_id = self._generate_point_id(
chunk_text,
{
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", "overview"),
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
},
)
points.append(
{
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", "overview"),
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
},
}
})
)
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -130,30 +131,35 @@ class QdrantAdaptor(SkillAdaptor):
ref_content,
payload_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks as points
for chunk_text, chunk_meta in chunks:
point_id = self._generate_point_id(chunk_text, {
"source": chunk_meta.get("source", metadata.name),
"file": chunk_meta.get("file", ref_file.name)
})
points.append({
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": chunk_text,
point_id = self._generate_point_id(
chunk_text,
{
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", category),
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
},
)
points.append(
{
"id": point_id,
"vector": None, # User will generate embeddings
"payload": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", category),
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
},
}
})
)
# Qdrant configuration
config = {
@@ -184,7 +190,7 @@ class QdrantAdaptor(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for Qdrant.
@@ -217,7 +223,7 @@ class QdrantAdaptor(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file

View File

@@ -36,7 +36,7 @@ class StreamingAdaptorMixin:
chunk_size: int = 4000,
chunk_overlap: int = 200,
batch_size: int = 100,
progress_callback: callable | None = None
progress_callback: callable | None = None,
) -> Path:
"""
Package skill using streaming ingestion.
@@ -60,9 +60,7 @@ class StreamingAdaptorMixin:
# Initialize streaming ingester
ingester = StreamingIngester(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
batch_size=batch_size
chunk_size=chunk_size, chunk_overlap=chunk_overlap, batch_size=batch_size
)
print(f"\n📊 Streaming ingestion starting...")
@@ -77,9 +75,11 @@ class StreamingAdaptorMixin:
nonlocal last_update
# Update every 10 chunks
if progress.processed_chunks - last_update >= 10:
print(f" {progress.progress_percent:.1f}% - "
f"{progress.processed_chunks}/{progress.total_chunks} chunks "
f"({progress.chunks_per_second:.1f} chunks/sec)")
print(
f" {progress.progress_percent:.1f}% - "
f"{progress.processed_chunks}/{progress.total_chunks} chunks "
f"({progress.chunks_per_second:.1f} chunks/sec)"
)
last_update = progress.processed_chunks
if progress_callback:
@@ -97,10 +97,7 @@ class StreamingAdaptorMixin:
# Convert chunks to platform format
print(f"\n📦 Converting to {self.PLATFORM_NAME} format...")
package_data = self._convert_chunks_to_platform_format(
all_chunks,
skill_dir.name
)
package_data = self._convert_chunks_to_platform_format(all_chunks, skill_dir.name)
# Determine output filename
if output_path.is_dir() or str(output_path).endswith("/"):
@@ -114,8 +111,7 @@ class StreamingAdaptorMixin:
# Write output
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(
json.dumps(package_data, indent=2, ensure_ascii=False),
encoding="utf-8"
json.dumps(package_data, indent=2, ensure_ascii=False), encoding="utf-8"
)
print(f"✅ Package created: {output_path}")
@@ -124,9 +120,7 @@ class StreamingAdaptorMixin:
return output_path
def _convert_chunks_to_platform_format(
self,
chunks: list[tuple[str, dict]],
skill_name: str
self, chunks: list[tuple[str, dict]], skill_name: str
) -> dict:
"""
Convert chunks to platform-specific format.
@@ -156,14 +150,11 @@ class StreamingAdaptorMixin:
"metadatas": metadatas,
"ids": ids,
"total_chunks": len(chunks),
"streaming": True
"streaming": True,
}
def estimate_chunks(
self,
skill_dir: Path,
chunk_size: int = 4000,
chunk_overlap: int = 200
self, skill_dir: Path, chunk_size: int = 4000, chunk_overlap: int = 200
) -> dict[str, Any]:
"""
Estimate chunking for a skill directory.
@@ -179,10 +170,7 @@ class StreamingAdaptorMixin:
Estimation statistics
"""
skill_dir = Path(skill_dir)
StreamingIngester(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap
)
StreamingIngester(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
# Count files and estimate chunks
total_docs = 0
@@ -201,11 +189,9 @@ class StreamingAdaptorMixin:
total_chars += char_count
estimated_chunks += chunk_count
file_stats.append({
"file": "SKILL.md",
"chars": char_count,
"estimated_chunks": chunk_count
})
file_stats.append(
{"file": "SKILL.md", "chars": char_count, "estimated_chunks": chunk_count}
)
# Reference files
refs_dir = skill_dir / "references"
@@ -214,17 +200,21 @@ class StreamingAdaptorMixin:
if ref_file.is_file() and not ref_file.name.startswith("."):
content = ref_file.read_text(encoding="utf-8")
char_count = len(content)
chunk_count = max(1, (char_count - chunk_overlap) // (chunk_size - chunk_overlap) + 1)
chunk_count = max(
1, (char_count - chunk_overlap) // (chunk_size - chunk_overlap) + 1
)
total_docs += 1
total_chars += char_count
estimated_chunks += chunk_count
file_stats.append({
"file": ref_file.name,
"chars": char_count,
"estimated_chunks": chunk_count
})
file_stats.append(
{
"file": ref_file.name,
"chars": char_count,
"estimated_chunks": chunk_count,
}
)
return {
"skill_name": skill_dir.name,
@@ -235,7 +225,7 @@ class StreamingAdaptorMixin:
"chunk_overlap": chunk_overlap,
"file_stats": file_stats,
"estimated_memory_mb": (total_chars * 2) / (1024 * 1024), # UTF-8 estimate
"recommended_streaming": total_chars > 1_000_000 or total_docs > 100
"recommended_streaming": total_chars > 1_000_000 or total_docs > 100,
}
@@ -251,25 +241,27 @@ class StreamingLangChainAdaptor(StreamingAdaptorMixin):
documents = []
for chunk_text, chunk_meta in chunks:
documents.append({
"page_content": chunk_text,
"metadata": {
"source": chunk_meta["source"],
"category": chunk_meta["category"],
"file": chunk_meta["file"],
"chunk_id": chunk_meta["chunk_id"],
"chunk_index": chunk_meta["chunk_index"],
"total_chunks": chunk_meta["total_chunks"],
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", "1.0.0"),
documents.append(
{
"page_content": chunk_text,
"metadata": {
"source": chunk_meta["source"],
"category": chunk_meta["category"],
"file": chunk_meta["file"],
"chunk_id": chunk_meta["chunk_id"],
"chunk_index": chunk_meta["chunk_index"],
"total_chunks": chunk_meta["total_chunks"],
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", "1.0.0"),
},
}
})
)
return {
"documents": documents,
"total_chunks": len(chunks),
"streaming": True,
"format": "LangChain Document"
"format": "LangChain Document",
}
@@ -287,14 +279,16 @@ class StreamingChromaAdaptor(StreamingAdaptorMixin):
for chunk_text, chunk_meta in chunks:
documents.append(chunk_text)
metadatas.append({
"source": chunk_meta["source"],
"category": chunk_meta["category"],
"file": chunk_meta["file"],
"chunk_index": chunk_meta["chunk_index"],
"total_chunks": chunk_meta["total_chunks"],
"type": chunk_meta.get("type", "documentation"),
})
metadatas.append(
{
"source": chunk_meta["source"],
"category": chunk_meta["category"],
"file": chunk_meta["file"],
"chunk_index": chunk_meta["chunk_index"],
"total_chunks": chunk_meta["total_chunks"],
"type": chunk_meta.get("type", "documentation"),
}
)
ids.append(chunk_meta["chunk_id"])
return {
@@ -303,7 +297,7 @@ class StreamingChromaAdaptor(StreamingAdaptorMixin):
"ids": ids,
"collection_name": skill_name.replace("_", "-"),
"total_chunks": len(chunks),
"streaming": True
"streaming": True,
}
@@ -339,11 +333,7 @@ def demo_streaming():
print("=" * 60)
output = adaptor.package_streaming(
skill_dir,
Path("output"),
chunk_size=2000,
chunk_overlap=100,
batch_size=50
skill_dir, Path("output"), chunk_size=2000, chunk_overlap=100, batch_size=50
)
print(f"\n✅ Complete! Output: {output}")

View File

@@ -104,11 +104,7 @@ class WeaviateAdaptor(SkillAdaptor):
}
def format_skill_md(
self,
skill_dir: Path,
metadata: SkillMetadata,
enable_chunking: bool = False,
**kwargs
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
"""
Format skill as JSON for Weaviate ingestion.
@@ -148,24 +144,26 @@ class WeaviateAdaptor(SkillAdaptor):
content,
obj_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file="SKILL.md"
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
)
# Add all chunks as objects
for chunk_text, chunk_meta in chunks:
objects.append({
"id": self._generate_uuid(chunk_text, chunk_meta),
"properties": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", "overview"),
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
},
})
objects.append(
{
"id": self._generate_uuid(chunk_text, chunk_meta),
"properties": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", "overview"),
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
},
}
)
# Convert all reference files using base helper method
for ref_file, ref_content in self._iterate_references(skill_dir):
@@ -186,24 +184,26 @@ class WeaviateAdaptor(SkillAdaptor):
ref_content,
obj_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
source_file=ref_file.name
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
)
# Add all chunks as objects
for chunk_text, chunk_meta in chunks:
objects.append({
"id": self._generate_uuid(chunk_text, chunk_meta),
"properties": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", category),
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
},
})
objects.append(
{
"id": self._generate_uuid(chunk_text, chunk_meta),
"properties": {
"content": chunk_text,
"source": chunk_meta.get("source", metadata.name),
"category": chunk_meta.get("category", category),
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
},
}
)
# Generate schema
class_name = "".join(word.capitalize() for word in metadata.name.split("_"))
@@ -222,7 +222,7 @@ class WeaviateAdaptor(SkillAdaptor):
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
preserve_code_blocks: bool = True
preserve_code_blocks: bool = True,
) -> Path:
"""
Package skill into JSON file for Weaviate.
@@ -258,7 +258,7 @@ class WeaviateAdaptor(SkillAdaptor):
metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
# Write to file
@@ -310,7 +310,7 @@ class WeaviateAdaptor(SkillAdaptor):
except ImportError:
return {
"success": False,
"message": "weaviate-client not installed. Run: pip install weaviate-client"
"message": "weaviate-client not installed. Run: pip install weaviate-client",
}
# Load package
@@ -319,16 +319,16 @@ class WeaviateAdaptor(SkillAdaptor):
# Connect to Weaviate
try:
if kwargs.get('use_cloud') and api_key:
if kwargs.get("use_cloud") and api_key:
# Weaviate Cloud
print(f"🌐 Connecting to Weaviate Cloud: {kwargs.get('cluster_url')}")
client = weaviate.Client(
url=kwargs.get('cluster_url'),
auth_client_secret=weaviate.AuthApiKey(api_key=api_key)
url=kwargs.get("cluster_url"),
auth_client_secret=weaviate.AuthApiKey(api_key=api_key),
)
else:
# Local Weaviate instance
weaviate_url = kwargs.get('weaviate_url', 'http://localhost:8080')
weaviate_url = kwargs.get("weaviate_url", "http://localhost:8080")
print(f"🌐 Connecting to Weaviate at: {weaviate_url}")
client = weaviate.Client(url=weaviate_url)
@@ -336,69 +336,67 @@ class WeaviateAdaptor(SkillAdaptor):
if not client.is_ready():
return {
"success": False,
"message": "Weaviate server not ready. Make sure Weaviate is running:\n docker run -p 8080:8080 semitechnologies/weaviate:latest"
"message": "Weaviate server not ready. Make sure Weaviate is running:\n docker run -p 8080:8080 semitechnologies/weaviate:latest",
}
except Exception as e:
return {
"success": False,
"message": f"Failed to connect to Weaviate: {e}\n\nMake sure Weaviate is running or provide correct credentials."
"message": f"Failed to connect to Weaviate: {e}\n\nMake sure Weaviate is running or provide correct credentials.",
}
# Create schema
try:
client.schema.create_class(data['schema'])
client.schema.create_class(data["schema"])
print(f"✅ Created schema: {data['class_name']}")
except Exception as e:
if "already exists" in str(e).lower():
print(f" Schema already exists: {data['class_name']}")
else:
return {
"success": False,
"message": f"Schema creation failed: {e}"
}
return {"success": False, "message": f"Schema creation failed: {e}"}
# Handle embeddings
embedding_function = kwargs.get('embedding_function')
embedding_function = kwargs.get("embedding_function")
try:
with client.batch as batch:
batch.batch_size = 100
if embedding_function == 'openai':
if embedding_function == "openai":
# Generate embeddings with OpenAI
print("🔄 Generating OpenAI embeddings and uploading...")
embeddings = self._generate_openai_embeddings(
[obj['properties']['content'] for obj in data['objects']],
api_key=kwargs.get('openai_api_key')
[obj["properties"]["content"] for obj in data["objects"]],
api_key=kwargs.get("openai_api_key"),
)
for i, obj in enumerate(data['objects']):
for i, obj in enumerate(data["objects"]):
batch.add_data_object(
data_object=obj['properties'],
class_name=data['class_name'],
uuid=obj['id'],
vector=embeddings[i]
data_object=obj["properties"],
class_name=data["class_name"],
uuid=obj["id"],
vector=embeddings[i],
)
if (i + 1) % 100 == 0:
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
elif embedding_function == 'sentence-transformers':
elif embedding_function == "sentence-transformers":
# Use sentence-transformers
print("🔄 Generating sentence-transformer embeddings and uploading...")
try:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
contents = [obj['properties']['content'] for obj in data['objects']]
model = SentenceTransformer("all-MiniLM-L6-v2")
contents = [obj["properties"]["content"] for obj in data["objects"]]
embeddings = model.encode(contents, show_progress_bar=True).tolist()
for i, obj in enumerate(data['objects']):
for i, obj in enumerate(data["objects"]):
batch.add_data_object(
data_object=obj['properties'],
class_name=data['class_name'],
uuid=obj['id'],
vector=embeddings[i]
data_object=obj["properties"],
class_name=data["class_name"],
uuid=obj["id"],
vector=embeddings[i],
)
if (i + 1) % 100 == 0:
@@ -407,42 +405,37 @@ class WeaviateAdaptor(SkillAdaptor):
except ImportError:
return {
"success": False,
"message": "sentence-transformers not installed. Run: pip install sentence-transformers"
"message": "sentence-transformers not installed. Run: pip install sentence-transformers",
}
else:
# No embeddings - Weaviate will use its configured vectorizer
print("🔄 Uploading objects (Weaviate will generate embeddings)...")
for i, obj in enumerate(data['objects']):
for i, obj in enumerate(data["objects"]):
batch.add_data_object(
data_object=obj['properties'],
class_name=data['class_name'],
uuid=obj['id']
data_object=obj["properties"],
class_name=data["class_name"],
uuid=obj["id"],
)
if (i + 1) % 100 == 0:
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
count = len(data['objects'])
count = len(data["objects"])
print(f"✅ Upload complete! {count} objects added to Weaviate")
return {
"success": True,
"message": f"Uploaded {count} objects to Weaviate class '{data['class_name']}'",
"class_name": data['class_name'],
"count": count
"class_name": data["class_name"],
"count": count,
}
except Exception as e:
return {
"success": False,
"message": f"Upload failed: {e}"
}
return {"success": False, "message": f"Upload failed: {e}"}
def _generate_openai_embeddings(
self,
documents: list[str],
api_key: str = None
self, documents: list[str], api_key: str = None
) -> list[list[float]]:
"""
Generate embeddings using OpenAI API.
@@ -455,12 +448,13 @@ class WeaviateAdaptor(SkillAdaptor):
List of embedding vectors
"""
import os
try:
from openai import OpenAI
except ImportError:
raise ImportError("openai not installed. Run: pip install openai") from None
api_key = api_key or os.getenv('OPENAI_API_KEY')
api_key = api_key or os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
@@ -473,14 +467,16 @@ class WeaviateAdaptor(SkillAdaptor):
print(f" Generating embeddings for {len(documents)} documents...")
for i in range(0, len(documents), batch_size):
batch = documents[i:i+batch_size]
batch = documents[i : i + batch_size]
try:
response = client.embeddings.create(
input=batch,
model="text-embedding-3-small" # Cheapest, fastest
model="text-embedding-3-small", # Cheapest, fastest
)
embeddings.extend([item.embedding for item in response.data])
print(f" ✓ Generated {min(i+batch_size, len(documents))}/{len(documents)} embeddings")
print(
f" ✓ Generated {min(i + batch_size, len(documents))}/{len(documents)} embeddings"
)
except Exception as e:
raise Exception(f"OpenAI embedding generation failed: {e}") from e

View File

@@ -101,10 +101,38 @@ class ArchitecturalPatternDetector:
# Web Frameworks
"Django": ["django", "manage.py", "settings.py", "urls.py"],
"Flask": ["flask", "app.py", "wsgi.py"],
"Spring": ["springframework", "org.springframework", "@Controller", "@Service", "@Repository"],
"ASP.NET": ["Microsoft.AspNetCore", "System.Web", "Controllers", "Models", "Views", ".cshtml", "Startup.cs"],
"Rails": ["rails", "action", "app/models", "app/views", "app/controllers", "config/routes.rb"],
"Angular": ["@angular", "angular", "app.module.ts", "@Component", "@Injectable", "angular.json"],
"Spring": [
"springframework",
"org.springframework",
"@Controller",
"@Service",
"@Repository",
],
"ASP.NET": [
"Microsoft.AspNetCore",
"System.Web",
"Controllers",
"Models",
"Views",
".cshtml",
"Startup.cs",
],
"Rails": [
"rails",
"action",
"app/models",
"app/views",
"app/controllers",
"config/routes.rb",
],
"Angular": [
"@angular",
"angular",
"app.module.ts",
"@Component",
"@Injectable",
"angular.json",
],
"React": ["react", "package.json", "components"],
"Vue.js": ["vue", ".vue", "components"],
"Express": ["express", "app.js", "routes"],
@@ -208,7 +236,9 @@ class ArchitecturalPatternDetector:
# Create searchable import string
import_content = " ".join(all_imports)
logger.debug(f"Collected {len(all_imports)} imports from {len([f for f in files if f.get('imports')])} files for framework detection")
logger.debug(
f"Collected {len(all_imports)} imports from {len([f for f in files if f.get('imports')])} files for framework detection"
)
# Also check actual directory structure for game engine markers
# (project.godot, .unity, .uproject are config files, not in analyzed files)
@@ -245,7 +275,9 @@ class ArchitecturalPatternDetector:
# Check in file paths, directory structure, AND imports
path_matches = sum(1 for marker in markers if marker.lower() in all_content.lower())
dir_matches = sum(1 for marker in markers if marker.lower() in dir_content.lower())
import_matches = sum(1 for marker in markers if marker.lower() in import_content.lower())
import_matches = sum(
1 for marker in markers if marker.lower() in import_content.lower()
)
# Strategy: Prioritize import-based detection (more accurate)
# If we have import matches, they're strong signals - use them alone
@@ -257,7 +289,9 @@ class ArchitecturalPatternDetector:
elif (path_matches + dir_matches) >= 2:
# Path/directory-based detection (requires 2+ matches)
detected.append(framework)
logger.info(f" 📦 Detected framework: {framework} (path:{path_matches} dir:{dir_matches})")
logger.info(
f" 📦 Detected framework: {framework} (path:{path_matches} dir:{dir_matches})"
)
return detected

View File

@@ -77,7 +77,9 @@ def run_embedding_benchmark(runner, config):
with bench.timer("batch_embedding"), bench.memory("batch_embedding"):
embeddings = generator.generate_batch(texts, model=model)
bench.metric("embeddings_per_sec", len(embeddings) / bench.result.timings[-1].duration, "emb/sec")
bench.metric(
"embeddings_per_sec", len(embeddings) / bench.result.timings[-1].duration, "emb/sec"
)
name = config.get("name", "embedding-benchmark")
report = runner.run(name, benchmark_func)
@@ -97,7 +99,7 @@ def run_storage_benchmark(runner, config):
storage = get_storage_adaptor(provider, bucket=bucket)
# Create test file
with NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
with NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
f.write("Test data" * 1000)
test_file = Path(f.name)
@@ -128,10 +130,7 @@ def compare_command(args):
"""Compare two benchmarks."""
runner = BenchmarkRunner()
comparison = runner.compare(
baseline_path=Path(args.baseline),
current_path=Path(args.current)
)
comparison = runner.compare(baseline_path=Path(args.baseline), current_path=Path(args.current))
print(f"\n📊 Comparison: {comparison.name}\n")
print(f"Overall: {comparison.overall_improvement}\n")
@@ -213,7 +212,7 @@ def cleanup_command(args):
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description='Performance benchmarking suite',
description="Performance benchmarking suite",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -233,54 +232,46 @@ Examples:
# Cleanup old benchmarks
skill-seekers-benchmark cleanup --keep 5
"""
""",
)
subparsers = parser.add_subparsers(dest='command', help='Command to execute')
subparsers = parser.add_subparsers(dest="command", help="Command to execute")
# Run command
run_parser = subparsers.add_parser('run', help='Run benchmark')
run_parser.add_argument('--config', required=True, help='Benchmark config file')
run_parser = subparsers.add_parser("run", help="Run benchmark")
run_parser.add_argument("--config", required=True, help="Benchmark config file")
run_parser.add_argument(
'--output-dir', '-o',
default='benchmarks',
help='Output directory (default: benchmarks)'
"--output-dir", "-o", default="benchmarks", help="Output directory (default: benchmarks)"
)
# Compare command
compare_parser = subparsers.add_parser('compare', help='Compare two benchmarks')
compare_parser.add_argument('--baseline', required=True, help='Baseline benchmark')
compare_parser.add_argument('--current', required=True, help='Current benchmark')
compare_parser = subparsers.add_parser("compare", help="Compare two benchmarks")
compare_parser.add_argument("--baseline", required=True, help="Baseline benchmark")
compare_parser.add_argument("--current", required=True, help="Current benchmark")
compare_parser.add_argument(
'--fail-on-regression',
action='store_true',
help='Exit with error if regressions detected'
"--fail-on-regression", action="store_true", help="Exit with error if regressions detected"
)
# List command
list_parser = subparsers.add_parser('list', help='List saved benchmarks')
list_parser = subparsers.add_parser("list", help="List saved benchmarks")
list_parser.add_argument(
'--output-dir', '-o',
default='benchmarks',
help='Benchmark directory (default: benchmarks)'
"--output-dir", "-o", default="benchmarks", help="Benchmark directory (default: benchmarks)"
)
# Show command
show_parser = subparsers.add_parser('show', help='Show benchmark details')
show_parser.add_argument('path', help='Path to benchmark file')
show_parser = subparsers.add_parser("show", help="Show benchmark details")
show_parser.add_argument("path", help="Path to benchmark file")
# Cleanup command
cleanup_parser = subparsers.add_parser('cleanup', help='Cleanup old benchmarks')
cleanup_parser = subparsers.add_parser("cleanup", help="Cleanup old benchmarks")
cleanup_parser.add_argument(
'--output-dir', '-o',
default='benchmarks',
help='Benchmark directory (default: benchmarks)'
"--output-dir", "-o", default="benchmarks", help="Benchmark directory (default: benchmarks)"
)
cleanup_parser.add_argument(
'--keep',
"--keep",
type=int,
default=5,
help='Number of latest benchmarks to keep per name (default: 5)'
help="Number of latest benchmarks to keep per name (default: 5)",
)
args = parser.parse_args()
@@ -290,20 +281,20 @@ Examples:
sys.exit(1)
try:
if args.command == 'run':
if args.command == "run":
run_command(args)
elif args.command == 'compare':
elif args.command == "compare":
compare_command(args)
elif args.command == 'list':
elif args.command == "list":
list_command(args)
elif args.command == 'show':
elif args.command == "show":
show_command(args)
elif args.command == 'cleanup':
elif args.command == "cleanup":
cleanup_command(args)
except Exception as e:
print(f"\n❌ Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -15,18 +15,13 @@ from .storage import get_storage_adaptor
def upload_command(args):
"""Handle upload subcommand."""
adaptor = get_storage_adaptor(
args.provider,
bucket=args.bucket,
container=args.container,
**parse_extra_args(args.extra)
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
)
if Path(args.local_path).is_dir():
print(f"📁 Uploading directory: {args.local_path}")
uploaded_files = adaptor.upload_directory(
args.local_path,
args.remote_path,
exclude_patterns=args.exclude
args.local_path, args.remote_path, exclude_patterns=args.exclude
)
print(f"✅ Uploaded {len(uploaded_files)} files")
if args.verbose:
@@ -41,19 +36,13 @@ def upload_command(args):
def download_command(args):
"""Handle download subcommand."""
adaptor = get_storage_adaptor(
args.provider,
bucket=args.bucket,
container=args.container,
**parse_extra_args(args.extra)
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
)
# Check if remote path is a directory (ends with /)
if args.remote_path.endswith('/'):
if args.remote_path.endswith("/"):
print(f"📁 Downloading directory: {args.remote_path}")
downloaded_files = adaptor.download_directory(
args.remote_path,
args.local_path
)
downloaded_files = adaptor.download_directory(args.remote_path, args.local_path)
print(f"✅ Downloaded {len(downloaded_files)} files")
if args.verbose:
for file_path in downloaded_files:
@@ -67,10 +56,7 @@ def download_command(args):
def list_command(args):
"""Handle list subcommand."""
adaptor = get_storage_adaptor(
args.provider,
bucket=args.bucket,
container=args.container,
**parse_extra_args(args.extra)
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
)
print(f"📋 Listing files: {args.prefix or '(root)'}")
@@ -99,15 +85,12 @@ def list_command(args):
def delete_command(args):
"""Handle delete subcommand."""
adaptor = get_storage_adaptor(
args.provider,
bucket=args.bucket,
container=args.container,
**parse_extra_args(args.extra)
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
)
if not args.force:
response = input(f"⚠️ Delete {args.remote_path}? [y/N]: ")
if response.lower() != 'y':
if response.lower() != "y":
print("❌ Deletion cancelled")
return
@@ -119,10 +102,7 @@ def delete_command(args):
def url_command(args):
"""Handle url subcommand."""
adaptor = get_storage_adaptor(
args.provider,
bucket=args.bucket,
container=args.container,
**parse_extra_args(args.extra)
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
)
print(f"🔗 Generating signed URL: {args.remote_path}")
@@ -134,10 +114,7 @@ def url_command(args):
def copy_command(args):
"""Handle copy subcommand."""
adaptor = get_storage_adaptor(
args.provider,
bucket=args.bucket,
container=args.container,
**parse_extra_args(args.extra)
args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra)
)
print(f"📋 Copying: {args.source_path}{args.dest_path}")
@@ -147,7 +124,7 @@ def copy_command(args):
def format_size(size_bytes: int) -> str:
"""Format file size in human-readable format."""
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
for unit in ["B", "KB", "MB", "GB", "TB"]:
if size_bytes < 1024.0:
return f"{size_bytes:.1f}{unit}"
size_bytes /= 1024.0
@@ -161,11 +138,11 @@ def parse_extra_args(extra: list | None) -> dict:
result = {}
for arg in extra:
if '=' in arg:
key, value = arg.split('=', 1)
result[key.lstrip('-')] = value
if "=" in arg:
key, value = arg.split("=", 1)
result[key.lstrip("-")] = value
else:
result[arg.lstrip('-')] = True
result[arg.lstrip("-")] = True
return result
@@ -173,7 +150,7 @@ def parse_extra_args(extra: list | None) -> dict:
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description='Cloud storage operations for Skill Seekers',
description="Cloud storage operations for Skill Seekers",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -197,114 +174,66 @@ Provider-specific options:
S3: --region=us-west-2 --endpoint-url=https://...
GCS: --project=my-project --credentials-path=/path/to/creds.json
Azure: --account-name=myaccount --account-key=...
"""
""",
)
# Global arguments
parser.add_argument(
'--provider',
choices=['s3', 'gcs', 'azure'],
required=True,
help='Cloud storage provider'
)
parser.add_argument(
'--bucket',
help='S3/GCS bucket name (for S3/GCS)'
)
parser.add_argument(
'--container',
help='Azure container name (for Azure)'
)
parser.add_argument(
'--verbose', '-v',
action='store_true',
help='Verbose output'
"--provider", choices=["s3", "gcs", "azure"], required=True, help="Cloud storage provider"
)
parser.add_argument("--bucket", help="S3/GCS bucket name (for S3/GCS)")
parser.add_argument("--container", help="Azure container name (for Azure)")
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
subparsers = parser.add_subparsers(dest='command', help='Command to execute')
subparsers = parser.add_subparsers(dest="command", help="Command to execute")
# Upload command
upload_parser = subparsers.add_parser('upload', help='Upload file or directory')
upload_parser.add_argument('local_path', help='Local file or directory path')
upload_parser.add_argument('remote_path', help='Remote path in cloud storage')
upload_parser = subparsers.add_parser("upload", help="Upload file or directory")
upload_parser.add_argument("local_path", help="Local file or directory path")
upload_parser.add_argument("remote_path", help="Remote path in cloud storage")
upload_parser.add_argument(
'--exclude',
action='append',
help='Glob patterns to exclude (for directories)'
)
upload_parser.add_argument(
'extra',
nargs='*',
help='Provider-specific options (--key=value)'
"--exclude", action="append", help="Glob patterns to exclude (for directories)"
)
upload_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
# Download command
download_parser = subparsers.add_parser('download', help='Download file or directory')
download_parser.add_argument('remote_path', help='Remote path in cloud storage')
download_parser.add_argument('local_path', help='Local destination path')
download_parser.add_argument(
'extra',
nargs='*',
help='Provider-specific options (--key=value)'
)
download_parser = subparsers.add_parser("download", help="Download file or directory")
download_parser.add_argument("remote_path", help="Remote path in cloud storage")
download_parser.add_argument("local_path", help="Local destination path")
download_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
# List command
list_parser = subparsers.add_parser('list', help='List files in cloud storage')
list_parser = subparsers.add_parser("list", help="List files in cloud storage")
list_parser.add_argument("--prefix", default="", help="Prefix to filter files")
list_parser.add_argument(
'--prefix',
default='',
help='Prefix to filter files'
)
list_parser.add_argument(
'--max-results',
type=int,
default=1000,
help='Maximum number of results'
)
list_parser.add_argument(
'extra',
nargs='*',
help='Provider-specific options (--key=value)'
"--max-results", type=int, default=1000, help="Maximum number of results"
)
list_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
# Delete command
delete_parser = subparsers.add_parser('delete', help='Delete file from cloud storage')
delete_parser.add_argument('remote_path', help='Remote path in cloud storage')
delete_parser = subparsers.add_parser("delete", help="Delete file from cloud storage")
delete_parser.add_argument("remote_path", help="Remote path in cloud storage")
delete_parser.add_argument(
'--force', '-f',
action='store_true',
help='Skip confirmation prompt'
)
delete_parser.add_argument(
'extra',
nargs='*',
help='Provider-specific options (--key=value)'
"--force", "-f", action="store_true", help="Skip confirmation prompt"
)
delete_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
# URL command
url_parser = subparsers.add_parser('url', help='Generate signed URL')
url_parser.add_argument('remote_path', help='Remote path in cloud storage')
url_parser = subparsers.add_parser("url", help="Generate signed URL")
url_parser.add_argument("remote_path", help="Remote path in cloud storage")
url_parser.add_argument(
'--expires-in',
"--expires-in",
type=int,
default=3600,
help='URL expiration time in seconds (default: 3600)'
)
url_parser.add_argument(
'extra',
nargs='*',
help='Provider-specific options (--key=value)'
help="URL expiration time in seconds (default: 3600)",
)
url_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
# Copy command
copy_parser = subparsers.add_parser('copy', help='Copy file within cloud storage')
copy_parser.add_argument('source_path', help='Source path')
copy_parser.add_argument('dest_path', help='Destination path')
copy_parser.add_argument(
'extra',
nargs='*',
help='Provider-specific options (--key=value)'
)
copy_parser = subparsers.add_parser("copy", help="Copy file within cloud storage")
copy_parser.add_argument("source_path", help="Source path")
copy_parser.add_argument("dest_path", help="Destination path")
copy_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)")
args = parser.parse_args()
@@ -313,26 +242,26 @@ Provider-specific options:
sys.exit(1)
# Validate bucket/container based on provider
if args.provider in ['s3', 'gcs'] and not args.bucket:
if args.provider in ["s3", "gcs"] and not args.bucket:
print(f"❌ Error: --bucket is required for {args.provider.upper()}", file=sys.stderr)
sys.exit(1)
elif args.provider == 'azure' and not args.container:
elif args.provider == "azure" and not args.container:
print("❌ Error: --container is required for Azure", file=sys.stderr)
sys.exit(1)
try:
# Execute command
if args.command == 'upload':
if args.command == "upload":
upload_command(args)
elif args.command == 'download':
elif args.command == "download":
download_command(args)
elif args.command == 'list':
elif args.command == "list":
list_command(args)
elif args.command == 'delete':
elif args.command == "delete":
delete_command(args)
elif args.command == 'url':
elif args.command == "url":
url_command(args)
elif args.command == 'copy':
elif args.command == "copy":
copy_command(args)
except FileNotFoundError as e:
@@ -342,9 +271,10 @@ Provider-specific options:
print(f"❌ Error: {e}", file=sys.stderr)
if args.verbose:
import traceback
traceback.print_exc()
sys.exit(1)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -376,8 +376,8 @@ class CodeAnalyzer:
for match in re.finditer(pattern, content):
module = match.group(1)
# Extract package name (before first /)
package = module.split('/')[0]
if package and not package.startswith('.'): # Skip relative imports
package = module.split("/")[0]
if package and not package.startswith("."): # Skip relative imports
imports.append(package)
return {
@@ -694,11 +694,11 @@ class CodeAnalyzer:
for match in re.finditer(using_pattern, content):
namespace = match.group(1).strip()
# Skip using aliases (using Foo = Bar.Baz)
if '=' not in namespace:
if "=" not in namespace:
# Extract base namespace (first 1-2 segments)
parts = namespace.split('.')
parts = namespace.split(".")
if len(parts) >= 2:
base_ns = '.'.join(parts[:2])
base_ns = ".".join(parts[:2])
imports.append(base_ns)
elif len(parts) == 1:
imports.append(parts[0])
@@ -1130,10 +1130,10 @@ class CodeAnalyzer:
for match in re.finditer(import_pattern, content):
import_path = match.group(1).strip()
# Extract package name (first 2-3 segments for framework detection)
parts = import_path.split('.')
parts = import_path.split(".")
if len(parts) >= 2:
# Get base package (e.g., "org.springframework" from "org.springframework.boot.SpringApplication")
package = '.'.join(parts[:2])
package = ".".join(parts[:2])
imports.append(package)
return {
@@ -1303,7 +1303,7 @@ class CodeAnalyzer:
for match in re.finditer(require_pattern, content):
module = match.group(1)
# Extract gem name (before first /)
gem = module.split('/')[0]
gem = module.split("/")[0]
imports.append(gem)
return {
@@ -1443,7 +1443,7 @@ class CodeAnalyzer:
for match in re.finditer(use_pattern, content):
namespace = match.group(1).strip()
# Extract vendor name (first segment)
parts = namespace.split('\\')
parts = namespace.split("\\")
if parts:
vendor = parts[0]
imports.append(vendor.lower())

View File

@@ -1036,11 +1036,15 @@ def analyze_codebase(
# Save summary statistics
summary_json = pattern_output / "summary.json"
with open(summary_json, "w", encoding="utf-8") as f:
json.dump({
"statistics": stats,
"thresholds": multi_level["thresholds"],
"files_analyzed": len(pattern_results),
}, f, indent=2)
json.dump(
{
"statistics": stats,
"thresholds": multi_level["thresholds"],
"files_analyzed": len(pattern_results),
},
f,
indent=2,
)
# Log results with breakdown by confidence
logger.info(f"✅ Detected {stats['total']} patterns in {len(pattern_results)} files")
@@ -1931,21 +1935,15 @@ def _check_deprecated_flags(args):
"⚠️ DEPRECATED: --ai-mode local → use --enhance-level without API key instead"
)
elif args.ai_mode == "none":
warnings.append(
"⚠️ DEPRECATED: --ai-mode none → use --enhance-level 0 instead"
)
warnings.append("⚠️ DEPRECATED: --ai-mode none → use --enhance-level 0 instead")
# Deprecated: --quick flag
if hasattr(args, "quick") and args.quick:
warnings.append(
"⚠️ DEPRECATED: --quick → use --preset quick instead"
)
warnings.append("⚠️ DEPRECATED: --quick → use --preset quick instead")
# Deprecated: --comprehensive flag
if hasattr(args, "comprehensive") and args.comprehensive:
warnings.append(
"⚠️ DEPRECATED: --comprehensive → use --preset comprehensive instead"
)
warnings.append("⚠️ DEPRECATED: --comprehensive → use --preset comprehensive instead")
# Show warnings if any found
if warnings:
@@ -2000,24 +1998,22 @@ Examples:
parser.add_argument(
"--preset",
choices=["quick", "standard", "comprehensive"],
help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)"
help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)",
)
parser.add_argument(
"--preset-list",
action="store_true",
help="Show available presets and exit"
"--preset-list", action="store_true", help="Show available presets and exit"
)
# Legacy preset flags (kept for backward compatibility)
parser.add_argument(
"--quick",
action="store_true",
help="[DEPRECATED] Quick analysis - use '--preset quick' instead"
help="[DEPRECATED] Quick analysis - use '--preset quick' instead",
)
parser.add_argument(
"--comprehensive",
action="store_true",
help="[DEPRECATED] Comprehensive analysis - use '--preset comprehensive' instead"
help="[DEPRECATED] Comprehensive analysis - use '--preset comprehensive' instead",
)
parser.add_argument(
@@ -2129,6 +2125,7 @@ Examples:
# Handle --preset-list flag BEFORE parse_args() to avoid required --directory validation
if "--preset-list" in sys.argv:
from skill_seekers.cli.presets import PresetManager
print(PresetManager.format_preset_help())
return 0
@@ -2155,6 +2152,7 @@ Examples:
# Apply preset using PresetManager
if preset_name:
from skill_seekers.cli.presets import PresetManager
try:
preset_args = PresetManager.apply_preset(preset_name, vars(args))
# Update args with preset values
@@ -2162,9 +2160,7 @@ Examples:
setattr(args, key, value)
preset = PresetManager.get_preset(preset_name)
logger.info(
f"{preset.icon} {preset.name} analysis mode: {preset.description}"
)
logger.info(f"{preset.icon} {preset.name} analysis mode: {preset.description}")
except ValueError as e:
logger.error(f"{e}")
return 1

View File

@@ -19,6 +19,7 @@ import numpy as np
@dataclass
class EmbeddingConfig:
"""Configuration for embedding generation."""
provider: str # 'openai', 'cohere', 'huggingface', 'local'
model: str
dimension: int
@@ -31,6 +32,7 @@ class EmbeddingConfig:
@dataclass
class EmbeddingResult:
"""Result of embedding generation."""
embeddings: list[list[float]]
metadata: dict[str, Any] = field(default_factory=dict)
cached_count: int = 0
@@ -42,6 +44,7 @@ class EmbeddingResult:
@dataclass
class CostTracker:
"""Track embedding generation costs."""
total_tokens: int = 0
total_requests: int = 0
cache_hits: int = 0
@@ -64,12 +67,12 @@ class CostTracker:
cache_rate = (self.cache_hits / self.total_requests * 100) if self.total_requests > 0 else 0
return {
'total_requests': self.total_requests,
'total_tokens': self.total_tokens,
'cache_hits': self.cache_hits,
'cache_misses': self.cache_misses,
'cache_rate': f"{cache_rate:.1f}%",
'estimated_cost': f"${self.estimated_cost:.4f}"
"total_requests": self.total_requests,
"total_tokens": self.total_tokens,
"cache_hits": self.cache_hits,
"cache_misses": self.cache_misses,
"cache_rate": f"{cache_rate:.1f}%",
"estimated_cost": f"${self.estimated_cost:.4f}",
}
@@ -97,18 +100,18 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
# Pricing per 1M tokens (as of 2026)
PRICING = {
'text-embedding-ada-002': 0.10,
'text-embedding-3-small': 0.02,
'text-embedding-3-large': 0.13,
"text-embedding-ada-002": 0.10,
"text-embedding-3-small": 0.02,
"text-embedding-3-large": 0.13,
}
DIMENSIONS = {
'text-embedding-ada-002': 1536,
'text-embedding-3-small': 1536,
'text-embedding-3-large': 3072,
"text-embedding-ada-002": 1536,
"text-embedding-3-small": 1536,
"text-embedding-3-large": 3072,
}
def __init__(self, model: str = 'text-embedding-ada-002', api_key: str | None = None):
def __init__(self, model: str = "text-embedding-ada-002", api_key: str | None = None):
"""Initialize OpenAI provider."""
self.model = model
self.api_key = api_key
@@ -119,9 +122,12 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
if self._client is None:
try:
from openai import OpenAI
self._client = OpenAI(api_key=self.api_key)
except ImportError:
raise ImportError("OpenAI package not installed. Install with: pip install openai") from None
raise ImportError(
"OpenAI package not installed. Install with: pip install openai"
) from None
return self._client
def generate_embeddings(self, texts: list[str]) -> list[list[float]]:
@@ -130,10 +136,7 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
embeddings = []
for text in texts:
response = client.embeddings.create(
model=self.model,
input=text
)
response = client.embeddings.create(model=self.model, input=text)
embeddings.append(response.data[0].embedding)
return embeddings
@@ -207,7 +210,7 @@ class EmbeddingCache:
if cache_file.exists():
try:
data = json.loads(cache_file.read_text())
embedding = data['embedding']
embedding = data["embedding"]
self._memory_cache[cache_key] = embedding
return embedding
except Exception:
@@ -226,12 +229,16 @@ class EmbeddingCache:
if self.cache_dir:
cache_file = self.cache_dir / f"{cache_key}.json"
try:
cache_file.write_text(json.dumps({
'text_hash': cache_key,
'model': model,
'embedding': embedding,
'timestamp': time.time()
}))
cache_file.write_text(
json.dumps(
{
"text_hash": cache_key,
"model": model,
"embedding": embedding,
"timestamp": time.time(),
}
)
)
except Exception as e:
print(f"⚠️ Warning: Failed to write cache: {e}")
@@ -252,9 +259,9 @@ class EmbeddingPipeline:
def _create_provider(self) -> EmbeddingProvider:
"""Create provider based on config."""
if self.config.provider == 'openai':
if self.config.provider == "openai":
return OpenAIEmbeddingProvider(self.config.model)
elif self.config.provider == 'local':
elif self.config.provider == "local":
return LocalEmbeddingProvider(self.config.dimension)
else:
raise ValueError(f"Unknown provider: {self.config.provider}")
@@ -264,11 +271,7 @@ class EmbeddingPipeline:
# Rough estimate: 1 token ≈ 4 characters
return len(text) // 4
def generate_batch(
self,
texts: list[str],
show_progress: bool = True
) -> EmbeddingResult:
def generate_batch(self, texts: list[str], show_progress: bool = True) -> EmbeddingResult:
"""
Generate embeddings for batch of texts.
@@ -293,7 +296,7 @@ class EmbeddingPipeline:
# Process in batches
for i in range(0, len(texts), self.config.batch_size):
batch = texts[i:i + self.config.batch_size]
batch = texts[i : i + self.config.batch_size]
batch_embeddings = []
to_generate = []
to_generate_indices = []
@@ -331,7 +334,7 @@ class EmbeddingPipeline:
if show_progress and len(texts) > self.config.batch_size:
progress = min(i + self.config.batch_size, len(texts))
print(f" Progress: {progress}/{len(texts)} ({progress/len(texts)*100:.1f}%)")
print(f" Progress: {progress}/{len(texts)} ({progress / len(texts) * 100:.1f}%)")
total_time = time.time() - start_time
@@ -342,21 +345,21 @@ class EmbeddingPipeline:
print(f" Generated: {generated_count}")
print(f" Time: {total_time:.2f}s")
if self.config.provider != 'local':
if self.config.provider != "local":
stats = self.cost_tracker.get_stats()
print(f" Cost: {stats['estimated_cost']}")
return EmbeddingResult(
embeddings=embeddings,
metadata={
'provider': self.config.provider,
'model': self.config.model,
'dimension': self.provider.get_dimension()
"provider": self.config.provider,
"model": self.config.model,
"dimension": self.provider.get_dimension(),
},
cached_count=cached_count,
generated_count=generated_count,
total_time=total_time,
cost_estimate=self.cost_tracker.estimated_cost
cost_estimate=self.cost_tracker.estimated_cost,
)
def validate_dimensions(self, embeddings: list[list[float]]) -> bool:
@@ -373,8 +376,10 @@ class EmbeddingPipeline:
for i, embedding in enumerate(embeddings):
if len(embedding) != expected_dim:
print(f"❌ Dimension mismatch at index {i}: "
f"expected {expected_dim}, got {len(embedding)}")
print(
f"❌ Dimension mismatch at index {i}: "
f"expected {expected_dim}, got {len(embedding)}"
)
return False
return True
@@ -390,11 +395,11 @@ def example_usage():
# Configure pipeline
config = EmbeddingConfig(
provider='local', # Use 'openai' for production
model='text-embedding-ada-002',
provider="local", # Use 'openai' for production
model="text-embedding-ada-002",
dimension=384,
batch_size=50,
cache_dir=Path("output/.embeddings_cache")
cache_dir=Path("output/.embeddings_cache"),
)
# Initialize pipeline

View File

@@ -175,8 +175,7 @@ class LocalSkillEnhancer:
dangerous_chars = [";", "&", "|", "$", "`", "\n", "\r"]
if any(char in cmd_template for char in dangerous_chars):
raise ValueError(
"Custom command contains dangerous shell characters. "
f"Command: {cmd_template}"
f"Custom command contains dangerous shell characters. Command: {cmd_template}"
)
try:
@@ -888,9 +887,7 @@ rm {prompt_file}
print("❌ SKILL.md not found after enhancement")
return False
else:
print(
f"{self.agent_display} returned error (exit code: {result.returncode})"
)
print(f"{self.agent_display} returned error (exit code: {result.returncode})")
if result.stderr:
print(f" Error: {result.stderr[:200]}")
return False

View File

@@ -16,6 +16,7 @@ from datetime import datetime
@dataclass
class DocumentVersion:
"""Version information for a document."""
file_path: str
content_hash: str
size_bytes: int
@@ -26,6 +27,7 @@ class DocumentVersion:
@dataclass
class ChangeSet:
"""Set of changes detected."""
added: list[DocumentVersion]
modified: list[DocumentVersion]
deleted: list[str]
@@ -45,6 +47,7 @@ class ChangeSet:
@dataclass
class UpdateMetadata:
"""Metadata for an incremental update."""
timestamp: str
previous_version: str
new_version: str
@@ -86,7 +89,7 @@ class IncrementalUpdater:
sha256 = hashlib.sha256()
try:
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
while chunk := f.read(8192):
sha256.update(chunk)
return sha256.hexdigest()
@@ -111,7 +114,7 @@ class IncrementalUpdater:
content_hash=self._compute_file_hash(skill_md),
size_bytes=skill_md.stat().st_size,
last_modified=skill_md.stat().st_mtime,
version=1
version=1,
)
# Scan references
@@ -125,7 +128,7 @@ class IncrementalUpdater:
content_hash=self._compute_file_hash(ref_file),
size_bytes=ref_file.stat().st_size,
last_modified=ref_file.stat().st_mtime,
version=1
version=1,
)
return versions
@@ -157,9 +160,8 @@ class IncrementalUpdater:
"timestamp": datetime.now().isoformat(),
"version": "1.0.0",
"documents": {
file_path: asdict(version)
for file_path, version in self.current_versions.items()
}
file_path: asdict(version) for file_path, version in self.current_versions.items()
},
}
self.version_file.write_text(json.dumps(data, indent=2))
@@ -180,10 +182,7 @@ class IncrementalUpdater:
if not has_previous:
# First time - all files are "added"
return ChangeSet(
added=list(self.current_versions.values()),
modified=[],
deleted=[],
unchanged=[]
added=list(self.current_versions.values()), modified=[], deleted=[], unchanged=[]
)
# Detect changes
@@ -215,18 +214,10 @@ class IncrementalUpdater:
else:
unchanged.append(current)
return ChangeSet(
added=added,
modified=modified,
deleted=deleted,
unchanged=unchanged
)
return ChangeSet(added=added, modified=modified, deleted=deleted, unchanged=unchanged)
def generate_update_package(
self,
change_set: ChangeSet,
output_path: Path,
include_content: bool = True
self, change_set: ChangeSet, output_path: Path, include_content: bool = True
) -> Path:
"""
Generate incremental update package.
@@ -250,11 +241,11 @@ class IncrementalUpdater:
"added": len(change_set.added),
"modified": len(change_set.modified),
"deleted": len(change_set.deleted),
"unchanged": len(change_set.unchanged)
"unchanged": len(change_set.unchanged),
},
"total_changes": change_set.total_changes
"total_changes": change_set.total_changes,
},
"changes": {}
"changes": {},
}
# Include changed documents
@@ -267,7 +258,7 @@ class IncrementalUpdater:
"version": doc.version,
"content": file_path.read_text(encoding="utf-8"),
"hash": doc.content_hash,
"size": doc.size_bytes
"size": doc.size_bytes,
}
# Modified documents
@@ -278,14 +269,12 @@ class IncrementalUpdater:
"version": doc.version,
"content": file_path.read_text(encoding="utf-8"),
"hash": doc.content_hash,
"size": doc.size_bytes
"size": doc.size_bytes,
}
# Deleted documents
for file_path in change_set.deleted:
update_data["changes"][file_path] = {
"action": "delete"
}
update_data["changes"][file_path] = {"action": "delete"}
# Write package
output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -332,7 +321,9 @@ class IncrementalUpdater:
if prev:
size_diff = doc.size_bytes - prev.size_bytes
size_str = f"{size_diff:+,} bytes" if size_diff != 0 else "same size"
lines.append(f" ~ {doc.file_path} (v{prev.version} → v{doc.version}, {size_str})")
lines.append(
f" ~ {doc.file_path} (v{prev.version} → v{doc.version}, {size_str})"
)
else:
lines.append(f" ~ {doc.file_path} (v{doc.version})")
lines.append("")
@@ -473,4 +464,5 @@ def main():
if __name__ == "__main__":
import sys
sys.exit(main())

View File

@@ -369,8 +369,6 @@ LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = {
(r"\$[0-9]+", 4),
(r"->", 3),
],
# ===== Markup/Config Languages =====
"html": [
(r"<!DOCTYPE\s+html>", 5),

View File

@@ -42,25 +42,25 @@ from skill_seekers.cli import __version__
# Command module mapping (command name -> module path)
COMMAND_MODULES = {
'config': 'skill_seekers.cli.config_command',
'scrape': 'skill_seekers.cli.doc_scraper',
'github': 'skill_seekers.cli.github_scraper',
'pdf': 'skill_seekers.cli.pdf_scraper',
'unified': 'skill_seekers.cli.unified_scraper',
'enhance': 'skill_seekers.cli.enhance_skill_local',
'enhance-status': 'skill_seekers.cli.enhance_status',
'package': 'skill_seekers.cli.package_skill',
'upload': 'skill_seekers.cli.upload_skill',
'estimate': 'skill_seekers.cli.estimate_pages',
'extract-test-examples': 'skill_seekers.cli.test_example_extractor',
'install-agent': 'skill_seekers.cli.install_agent',
'analyze': 'skill_seekers.cli.codebase_scraper',
'install': 'skill_seekers.cli.install_skill',
'resume': 'skill_seekers.cli.resume_command',
'stream': 'skill_seekers.cli.streaming_ingest',
'update': 'skill_seekers.cli.incremental_updater',
'multilang': 'skill_seekers.cli.multilang_support',
'quality': 'skill_seekers.cli.quality_metrics',
"config": "skill_seekers.cli.config_command",
"scrape": "skill_seekers.cli.doc_scraper",
"github": "skill_seekers.cli.github_scraper",
"pdf": "skill_seekers.cli.pdf_scraper",
"unified": "skill_seekers.cli.unified_scraper",
"enhance": "skill_seekers.cli.enhance_skill_local",
"enhance-status": "skill_seekers.cli.enhance_status",
"package": "skill_seekers.cli.package_skill",
"upload": "skill_seekers.cli.upload_skill",
"estimate": "skill_seekers.cli.estimate_pages",
"extract-test-examples": "skill_seekers.cli.test_example_extractor",
"install-agent": "skill_seekers.cli.install_agent",
"analyze": "skill_seekers.cli.codebase_scraper",
"install": "skill_seekers.cli.install_skill",
"resume": "skill_seekers.cli.resume_command",
"stream": "skill_seekers.cli.streaming_ingest",
"update": "skill_seekers.cli.incremental_updater",
"multilang": "skill_seekers.cli.multilang_support",
"quality": "skill_seekers.cli.quality_metrics",
}
@@ -124,12 +124,21 @@ def _reconstruct_argv(command: str, args: argparse.Namespace) -> list[str]:
# Convert args to sys.argv format
for key, value in vars(args).items():
if key == 'command':
if key == "command":
continue
# Handle positional arguments (no -- prefix)
if key in ['url', 'directory', 'file', 'job_id', 'skill_directory', 'zip_file', 'config', 'input_file']:
if value is not None and value != '':
if key in [
"url",
"directory",
"file",
"job_id",
"skill_directory",
"zip_file",
"config",
"input_file",
]:
if value is not None and value != "":
argv.append(str(value))
continue
@@ -172,7 +181,7 @@ def main(argv: list[str] | None = None) -> int:
return 1
# Special handling for 'analyze' command (has post-processing)
if args.command == 'analyze':
if args.command == "analyze":
return _handle_analyze_command(args)
# Standard delegation for all other commands
@@ -200,6 +209,7 @@ def main(argv: list[str] | None = None) -> int:
# Show traceback in verbose mode
import traceback
if hasattr(args, "verbose") and getattr(args, "verbose", False):
traceback.print_exc()
@@ -226,13 +236,16 @@ def _handle_analyze_command(args: argparse.Namespace) -> int:
# Handle preset flags (depth and features)
if args.quick:
sys.argv.extend([
"--depth", "surface",
"--skip-patterns",
"--skip-test-examples",
"--skip-how-to-guides",
"--skip-config-patterns",
])
sys.argv.extend(
[
"--depth",
"surface",
"--skip-patterns",
"--skip-test-examples",
"--skip-how-to-guides",
"--skip-config-patterns",
]
)
elif args.comprehensive:
sys.argv.extend(["--depth", "full"])
elif args.depth:
@@ -246,6 +259,7 @@ def _handle_analyze_command(args: argparse.Namespace) -> int:
elif args.enhance:
try:
from skill_seekers.cli.config_manager import get_config_manager
config = get_config_manager()
enhance_level = config.get_default_enhance_level()
except Exception:

View File

@@ -15,6 +15,7 @@ import json
@dataclass
class LanguageInfo:
"""Language information for a document."""
code: str # ISO 639-1 code (e.g., 'en', 'es', 'zh')
name: str # Full name (e.g., 'English', 'Spanish', 'Chinese')
confidence: float # Detection confidence (0.0-1.0)
@@ -24,6 +25,7 @@ class LanguageInfo:
@dataclass
class TranslationStatus:
"""Translation status for a document."""
source_language: str
target_languages: list[str]
translated_languages: set[str]
@@ -40,74 +42,81 @@ class LanguageDetector:
# Common word patterns by language
LANGUAGE_PATTERNS = {
'en': [
r'\b(the|and|is|are|in|to|of|for|with|on|at|by|from)\b',
r'\b(this|that|these|those|what|which|who|where|when)\b',
"en": [
r"\b(the|and|is|are|in|to|of|for|with|on|at|by|from)\b",
r"\b(this|that|these|those|what|which|who|where|when)\b",
],
'es': [
r'\b(el|la|los|las|de|en|y|a|es|por|para|con|su)\b',
r'\b(que|no|un|una|como|más|pero|muy|todo|ya)\b',
"es": [
r"\b(el|la|los|las|de|en|y|a|es|por|para|con|su)\b",
r"\b(que|no|un|una|como|más|pero|muy|todo|ya)\b",
],
'fr': [
r'\b(le|la|les|de|et|en|un|une|pour|dans|que|sur|avec)\b',
r'\b(est|sont|ce|qui|plus|ne|pas|nous|vous|tout)\b',
"fr": [
r"\b(le|la|les|de|et|en|un|une|pour|dans|que|sur|avec)\b",
r"\b(est|sont|ce|qui|plus|ne|pas|nous|vous|tout)\b",
],
'de': [
r'\b(der|die|das|und|in|zu|den|von|ist|mit|für|auf)\b',
r'\b(ein|eine|nicht|sich|auch|werden|an|als|ich|sie)\b',
"de": [
r"\b(der|die|das|und|in|zu|den|von|ist|mit|für|auf)\b",
r"\b(ein|eine|nicht|sich|auch|werden|an|als|ich|sie)\b",
],
'zh': [
r'[\u4e00-\u9fff]', # Chinese characters
r'(的|了|和|是|在|有|我|他|不|这)',
"zh": [
r"[\u4e00-\u9fff]", # Chinese characters
r"(的|了|和|是|在|有|我|他|不|这)",
],
'ja': [
r'[\u3040-\u309f]', # Hiragana
r'[\u30a0-\u30ff]', # Katakana
r'[\u4e00-\u9faf]', # Kanji
"ja": [
r"[\u3040-\u309f]", # Hiragana
r"[\u30a0-\u30ff]", # Katakana
r"[\u4e00-\u9faf]", # Kanji
],
'ko': [
r'[\uac00-\ud7af]', # Hangul
r'(의|가|이|은|들|는|좀|잘|께|을)',
"ko": [
r"[\uac00-\ud7af]", # Hangul
r"(의|가|이|은|들|는|좀|잘|께|을)",
],
'ru': [
r'[\u0400-\u04ff]', # Cyrillic
r'\b(и|в|не|на|с|что|он|по|а|как|это|все)\b',
"ru": [
r"[\u0400-\u04ff]", # Cyrillic
r"\b(и|в|не|на|с|что|он|по|а|как|это|все)\b",
],
'pt': [
r'\b(o|a|de|e|do|da|em|um|para|é|com|não|os|as)\b',
r'\b(que|se|mais|por|dos|das|como|mas|uma|ou)\b',
"pt": [
r"\b(o|a|de|e|do|da|em|um|para|é|com|não|os|as)\b",
r"\b(que|se|mais|por|dos|das|como|mas|uma|ou)\b",
],
'it': [
r'\b(il|la|di|e|a|da|in|che|per|un|una|non|del)\b',
r'\b(con|alla|della|al|nel|sono|come|più|ma|dei)\b',
"it": [
r"\b(il|la|di|e|a|da|in|che|per|un|una|non|del)\b",
r"\b(con|alla|della|al|nel|sono|come|più|ma|dei)\b",
],
'ar': [
r'[\u0600-\u06ff]', # Arabic
r'(في|من|على|إلى|هذا|ما|أن|كان|هو|التي)',
"ar": [
r"[\u0600-\u06ff]", # Arabic
r"(في|من|على|إلى|هذا|ما|أن|كان|هو|التي)",
],
}
# Language names
LANGUAGE_NAMES = {
'en': 'English',
'es': 'Spanish',
'fr': 'French',
'de': 'German',
'zh': 'Chinese',
'ja': 'Japanese',
'ko': 'Korean',
'ru': 'Russian',
'pt': 'Portuguese',
'it': 'Italian',
'ar': 'Arabic',
"en": "English",
"es": "Spanish",
"fr": "French",
"de": "German",
"zh": "Chinese",
"ja": "Japanese",
"ko": "Korean",
"ru": "Russian",
"pt": "Portuguese",
"it": "Italian",
"ar": "Arabic",
}
# Script types
SCRIPTS = {
'en': 'Latin', 'es': 'Latin', 'fr': 'Latin', 'de': 'Latin',
'pt': 'Latin', 'it': 'Latin',
'zh': 'Han', 'ja': 'Japanese', 'ko': 'Hangul',
'ru': 'Cyrillic', 'ar': 'Arabic',
"en": "Latin",
"es": "Latin",
"fr": "Latin",
"de": "Latin",
"pt": "Latin",
"it": "Latin",
"zh": "Han",
"ja": "Japanese",
"ko": "Hangul",
"ru": "Cyrillic",
"ar": "Arabic",
}
def detect(self, text: str, sample_size: int = 2000) -> LanguageInfo:
@@ -122,7 +131,7 @@ class LanguageDetector:
LanguageInfo with detected language
"""
if not text.strip():
return LanguageInfo('en', 'English', 0.0)
return LanguageInfo("en", "English", 0.0)
# Sample text for efficiency
sample = text[:sample_size].lower()
@@ -140,7 +149,7 @@ class LanguageDetector:
# Find best match
if not scores or max(scores.values()) == 0:
# Default to English
return LanguageInfo('en', 'English', 0.1)
return LanguageInfo("en", "English", 0.1)
best_lang = max(scores, key=scores.get)
total_score = sum(scores.values())
@@ -150,7 +159,7 @@ class LanguageDetector:
code=best_lang,
name=self.LANGUAGE_NAMES.get(best_lang, best_lang.upper()),
confidence=min(confidence, 1.0),
script=self.SCRIPTS.get(best_lang)
script=self.SCRIPTS.get(best_lang),
)
def detect_from_filename(self, filename: str) -> str | None:
@@ -170,12 +179,12 @@ class LanguageDetector:
ISO 639-1 language code or None
"""
# Pattern: file.en.md
match = re.search(r'\.([a-z]{2})\.md$', filename)
match = re.search(r"\.([a-z]{2})\.md$", filename)
if match and match.group(1) in self.LANGUAGE_NAMES:
return match.group(1)
# Pattern: file_en.md or file-en.md
match = re.search(r'[_-]([a-z]{2})\.md$', filename)
match = re.search(r"[_-]([a-z]{2})\.md$", filename)
if match and match.group(1) in self.LANGUAGE_NAMES:
return match.group(1)
@@ -200,7 +209,7 @@ class MultiLanguageManager:
file_path: str,
content: str,
metadata: dict | None = None,
force_language: str | None = None
force_language: str | None = None,
) -> None:
"""
Add document with language detection.
@@ -218,7 +227,7 @@ class MultiLanguageManager:
code=lang_code,
name=self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
confidence=1.0,
script=self.detector.SCRIPTS.get(lang_code)
script=self.detector.SCRIPTS.get(lang_code),
)
else:
# Try filename pattern first
@@ -229,7 +238,7 @@ class MultiLanguageManager:
code=lang_code,
name=self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
confidence=0.95,
script=self.detector.SCRIPTS.get(lang_code)
script=self.detector.SCRIPTS.get(lang_code),
)
else:
# Detect from content
@@ -245,13 +254,13 @@ class MultiLanguageManager:
self.documents[lang_code] = []
doc = {
'file_path': file_path,
'content': content,
'language': lang_info.code,
'language_name': lang_info.name,
'confidence': lang_info.confidence,
'script': lang_info.script,
'metadata': metadata or {}
"file_path": file_path,
"content": content,
"language": lang_info.code,
"language_name": lang_info.name,
"confidence": lang_info.confidence,
"script": lang_info.script,
"metadata": metadata or {},
}
self.documents[lang_code].append(doc)
@@ -284,7 +293,7 @@ class MultiLanguageManager:
Returns:
Translation status summary
"""
base_lang = base_language or self.primary_language or 'en'
base_lang = base_language or self.primary_language or "en"
all_languages = set(self.documents.keys())
base_count = self.get_document_count(base_lang)
@@ -295,7 +304,7 @@ class MultiLanguageManager:
target_languages=[],
translated_languages=set(),
missing_languages=set(),
completeness=0.0
completeness=0.0,
)
# Check which languages have translations
@@ -305,7 +314,7 @@ class MultiLanguageManager:
translated.add(lang)
# Commonly expected languages for completeness
expected_languages = {'en', 'es', 'fr', 'de', 'zh', 'ja'}
expected_languages = {"en", "es", "fr", "de", "zh", "ja"}
missing = expected_languages - all_languages
completeness = len(all_languages) / len(expected_languages)
@@ -315,7 +324,7 @@ class MultiLanguageManager:
target_languages=list(all_languages - {base_lang}),
translated_languages=translated,
missing_languages=missing,
completeness=min(completeness, 1.0)
completeness=min(completeness, 1.0),
)
def export_by_language(self, output_dir: Path) -> dict[str, Path]:
@@ -337,10 +346,10 @@ class MultiLanguageManager:
lang_file = output_dir / f"documents_{lang_code}.json"
export_data = {
'language': lang_code,
'language_name': self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
'document_count': len(docs),
'documents': docs
"language": lang_code,
"language_name": self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()),
"document_count": len(docs),
"documents": docs,
}
lang_file.write_text(json.dumps(export_data, indent=2, ensure_ascii=False))
@@ -419,9 +428,7 @@ def main():
skill_md = skill_dir / "SKILL.md"
if skill_md.exists():
manager.add_document(
"SKILL.md",
skill_md.read_text(encoding="utf-8"),
{"category": "overview"}
"SKILL.md", skill_md.read_text(encoding="utf-8"), {"category": "overview"}
)
# Load reference files
@@ -429,9 +436,7 @@ def main():
if refs_dir.exists():
for ref_file in refs_dir.glob("*.md"):
manager.add_document(
ref_file.name,
ref_file.read_text(encoding="utf-8"),
{"category": ref_file.stem}
ref_file.name, ref_file.read_text(encoding="utf-8"), {"category": ref_file.stem}
)
# Detect languages
@@ -460,4 +465,5 @@ def main():
if __name__ == "__main__":
import sys
sys.exit(main())

View File

@@ -113,7 +113,15 @@ def package_skill(
output_dir = skill_path.parent
# Auto-enable chunking for RAG platforms
RAG_PLATFORMS = ['langchain', 'llama-index', 'haystack', 'weaviate', 'chroma', 'faiss', 'qdrant']
RAG_PLATFORMS = [
"langchain",
"llama-index",
"haystack",
"weaviate",
"chroma",
"faiss",
"qdrant",
]
if target in RAG_PLATFORMS and not enable_chunking:
print(f" Auto-enabling chunking for {target} platform")
@@ -126,17 +134,19 @@ def package_skill(
if streaming:
print(f" Mode: Streaming (chunk_size={chunk_size}, overlap={chunk_overlap})")
elif enable_chunking:
print(f" Chunking: Enabled (max_tokens={chunk_max_tokens}, preserve_code={preserve_code_blocks})")
print(
f" Chunking: Enabled (max_tokens={chunk_max_tokens}, preserve_code={preserve_code_blocks})"
)
try:
# Use streaming if requested and supported
if streaming and hasattr(adaptor, 'package_streaming'):
if streaming and hasattr(adaptor, "package_streaming"):
package_path = adaptor.package_streaming(
skill_path,
output_dir,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
batch_size=batch_size
batch_size=batch_size,
)
elif streaming:
print("⚠️ Streaming not supported for this platform, using standard packaging")
@@ -145,7 +155,7 @@ def package_skill(
output_dir,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
else:
package_path = adaptor.package(
@@ -153,7 +163,7 @@ def package_skill(
output_dir,
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks
preserve_code_blocks=preserve_code_blocks,
)
print(f" Output: {package_path}")
@@ -212,7 +222,19 @@ Examples:
parser.add_argument(
"--target",
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "haystack", "weaviate", "chroma", "faiss", "qdrant"],
choices=[
"claude",
"gemini",
"openai",
"markdown",
"langchain",
"llama-index",
"haystack",
"weaviate",
"chroma",
"faiss",
"qdrant",
],
default="claude",
help="Target LLM platform (default: claude)",
)

View File

@@ -3,6 +3,7 @@
This module registers all subcommand parsers and provides a factory
function to create them.
"""
from .base import SubcommandParser
# Import all parser classes

View File

@@ -1,4 +1,5 @@
"""Analyze subcommand parser."""
from .base import SubcommandParser
@@ -21,26 +22,26 @@ class AnalyzeParser(SubcommandParser):
"""Add analyze-specific arguments."""
parser.add_argument("--directory", required=True, help="Directory to analyze")
parser.add_argument(
"--output", default="output/codebase/", help="Output directory (default: output/codebase/)"
"--output",
default="output/codebase/",
help="Output directory (default: output/codebase/)",
)
# Preset selection (NEW - recommended way)
parser.add_argument(
"--preset",
choices=["quick", "standard", "comprehensive"],
help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)"
help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)",
)
parser.add_argument(
"--preset-list",
action="store_true",
help="Show available presets and exit"
"--preset-list", action="store_true", help="Show available presets and exit"
)
# Legacy preset flags (kept for backward compatibility)
parser.add_argument(
"--quick",
action="store_true",
help="[DEPRECATED] Quick analysis - use '--preset quick' instead"
help="[DEPRECATED] Quick analysis - use '--preset quick' instead",
)
parser.add_argument(
"--comprehensive",
@@ -71,15 +72,9 @@ class AnalyzeParser(SubcommandParser):
help="AI enhancement level: 0=off, 1=SKILL.md only (default), 2=+Architecture+Config, 3=full",
)
parser.add_argument("--skip-api-reference", action="store_true", help="Skip API docs")
parser.add_argument(
"--skip-dependency-graph", action="store_true", help="Skip dep graph"
)
parser.add_argument(
"--skip-patterns", action="store_true", help="Skip pattern detection"
)
parser.add_argument(
"--skip-test-examples", action="store_true", help="Skip test examples"
)
parser.add_argument("--skip-dependency-graph", action="store_true", help="Skip dep graph")
parser.add_argument("--skip-patterns", action="store_true", help="Skip pattern detection")
parser.add_argument("--skip-test-examples", action="store_true", help="Skip test examples")
parser.add_argument("--skip-how-to-guides", action="store_true", help="Skip guides")
parser.add_argument("--skip-config-patterns", action="store_true", help="Skip config")
parser.add_argument(

View File

@@ -1,4 +1,5 @@
"""Base parser class for subcommands."""
from abc import ABC, abstractmethod
import argparse
@@ -48,10 +49,6 @@ class SubcommandParser(ABC):
Returns:
Configured ArgumentParser for this subcommand
"""
parser = subparsers.add_parser(
self.name,
help=self.help,
description=self.description
)
parser = subparsers.add_parser(self.name, help=self.help, description=self.description)
self.add_arguments(parser)
return parser

View File

@@ -1,4 +1,5 @@
"""Config subcommand parser."""
from .base import SubcommandParser
@@ -22,9 +23,7 @@ class ConfigParser(SubcommandParser):
parser.add_argument(
"--github", action="store_true", help="Go directly to GitHub token setup"
)
parser.add_argument(
"--api-keys", action="store_true", help="Go directly to API keys setup"
)
parser.add_argument("--api-keys", action="store_true", help="Go directly to API keys setup")
parser.add_argument(
"--show", action="store_true", help="Show current configuration and exit"
)

View File

@@ -1,4 +1,5 @@
"""Enhance subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Enhance-status subcommand parser."""
from .base import SubcommandParser
@@ -20,10 +21,6 @@ class EnhanceStatusParser(SubcommandParser):
def add_arguments(self, parser):
"""Add enhance-status-specific arguments."""
parser.add_argument("skill_directory", help="Skill directory path")
parser.add_argument(
"--watch", "-w", action="store_true", help="Watch in real-time"
)
parser.add_argument("--watch", "-w", action="store_true", help="Watch in real-time")
parser.add_argument("--json", action="store_true", help="JSON output")
parser.add_argument(
"--interval", type=int, default=2, help="Watch interval in seconds"
)
parser.add_argument("--interval", type=int, default=2, help="Watch interval in seconds")

View File

@@ -1,4 +1,5 @@
"""Estimate subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""GitHub subcommand parser."""
from .base import SubcommandParser
@@ -24,9 +25,7 @@ class GitHubParser(SubcommandParser):
parser.add_argument("--name", help="Skill name")
parser.add_argument("--description", help="Skill description")
parser.add_argument("--enhance", action="store_true", help="AI enhancement (API)")
parser.add_argument(
"--enhance-local", action="store_true", help="AI enhancement (local)"
)
parser.add_argument("--enhance-local", action="store_true", help="AI enhancement (local)")
parser.add_argument("--api-key", type=str, help="Anthropic API key for --enhance")
parser.add_argument(
"--non-interactive",

View File

@@ -1,4 +1,5 @@
"""Install-agent subcommand parser."""
from .base import SubcommandParser
@@ -19,9 +20,7 @@ class InstallAgentParser(SubcommandParser):
def add_arguments(self, parser):
"""Add install-agent-specific arguments."""
parser.add_argument(
"skill_directory", help="Skill directory path (e.g., output/react/)"
)
parser.add_argument("skill_directory", help="Skill directory path (e.g., output/react/)")
parser.add_argument(
"--agent",
required=True,

View File

@@ -1,4 +1,5 @@
"""Install subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Multilang subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Package subcommand parser."""
from .base import SubcommandParser
@@ -20,27 +21,72 @@ class PackageParser(SubcommandParser):
def add_arguments(self, parser):
"""Add package-specific arguments."""
parser.add_argument("skill_directory", help="Skill directory path (e.g., output/react/)")
parser.add_argument("--no-open", action="store_true", help="Don't open output folder after packaging")
parser.add_argument("--skip-quality-check", action="store_true", help="Skip quality checks before packaging")
parser.add_argument(
"--no-open", action="store_true", help="Don't open output folder after packaging"
)
parser.add_argument(
"--skip-quality-check", action="store_true", help="Skip quality checks before packaging"
)
parser.add_argument(
"--target",
choices=[
"claude", "gemini", "openai", "markdown",
"langchain", "llama-index", "haystack",
"weaviate", "chroma", "faiss", "qdrant"
"claude",
"gemini",
"openai",
"markdown",
"langchain",
"llama-index",
"haystack",
"weaviate",
"chroma",
"faiss",
"qdrant",
],
default="claude",
help="Target LLM platform (default: claude)",
)
parser.add_argument("--upload", action="store_true", help="Automatically upload after packaging (requires platform API key)")
parser.add_argument(
"--upload",
action="store_true",
help="Automatically upload after packaging (requires platform API key)",
)
# Streaming options
parser.add_argument("--streaming", action="store_true", help="Use streaming ingestion for large docs (memory-efficient)")
parser.add_argument("--chunk-size", type=int, default=4000, help="Maximum characters per chunk (streaming mode, default: 4000)")
parser.add_argument("--chunk-overlap", type=int, default=200, help="Overlap between chunks (streaming mode, default: 200)")
parser.add_argument("--batch-size", type=int, default=100, help="Number of chunks per batch (streaming mode, default: 100)")
parser.add_argument(
"--streaming",
action="store_true",
help="Use streaming ingestion for large docs (memory-efficient)",
)
parser.add_argument(
"--chunk-size",
type=int,
default=4000,
help="Maximum characters per chunk (streaming mode, default: 4000)",
)
parser.add_argument(
"--chunk-overlap",
type=int,
default=200,
help="Overlap between chunks (streaming mode, default: 200)",
)
parser.add_argument(
"--batch-size",
type=int,
default=100,
help="Number of chunks per batch (streaming mode, default: 100)",
)
# RAG chunking options
parser.add_argument("--chunk", action="store_true", help="Enable intelligent chunking for RAG platforms (auto-enabled for RAG adaptors)")
parser.add_argument("--chunk-tokens", type=int, default=512, help="Maximum tokens per chunk (default: 512)")
parser.add_argument("--no-preserve-code", action="store_true", help="Allow code block splitting (default: code blocks preserved)")
parser.add_argument(
"--chunk",
action="store_true",
help="Enable intelligent chunking for RAG platforms (auto-enabled for RAG adaptors)",
)
parser.add_argument(
"--chunk-tokens", type=int, default=512, help="Maximum tokens per chunk (default: 512)"
)
parser.add_argument(
"--no-preserve-code",
action="store_true",
help="Allow code block splitting (default: code blocks preserved)",
)

View File

@@ -1,4 +1,5 @@
"""PDF subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Quality subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Resume subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Scrape subcommand parser."""
from .base import SubcommandParser
@@ -24,15 +25,16 @@ class ScrapeParser(SubcommandParser):
parser.add_argument("--name", help="Skill name")
parser.add_argument("--description", help="Skill description")
parser.add_argument(
"--max-pages", type=int, dest="max_pages", help="Maximum pages to scrape (override config)"
"--max-pages",
type=int,
dest="max_pages",
help="Maximum pages to scrape (override config)",
)
parser.add_argument(
"--skip-scrape", action="store_true", help="Skip scraping, use cached data"
)
parser.add_argument("--enhance", action="store_true", help="AI enhancement (API)")
parser.add_argument(
"--enhance-local", action="store_true", help="AI enhancement (local)"
)
parser.add_argument("--enhance-local", action="store_true", help="AI enhancement (local)")
parser.add_argument("--dry-run", action="store_true", help="Dry run mode")
parser.add_argument(
"--async", dest="async_mode", action="store_true", help="Use async scraping"

View File

@@ -1,4 +1,5 @@
"""Stream subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Extract-test-examples subcommand parser."""
from .base import SubcommandParser
@@ -19,9 +20,7 @@ class TestExamplesParser(SubcommandParser):
def add_arguments(self, parser):
"""Add extract-test-examples-specific arguments."""
parser.add_argument(
"directory", nargs="?", help="Directory containing test files"
)
parser.add_argument("directory", nargs="?", help="Directory containing test files")
parser.add_argument("--file", help="Single test file to analyze")
parser.add_argument(
"--language", help="Filter by programming language (python, javascript, etc.)"
@@ -36,6 +35,4 @@ class TestExamplesParser(SubcommandParser):
"--max-per-file", type=int, default=10, help="Maximum examples per file (default: 10)"
)
parser.add_argument("--json", action="store_true", help="Output JSON format")
parser.add_argument(
"--markdown", action="store_true", help="Output Markdown format"
)
parser.add_argument("--markdown", action="store_true", help="Output Markdown format")

View File

@@ -1,4 +1,5 @@
"""Unified subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Update subcommand parser."""
from .base import SubcommandParser

View File

@@ -1,4 +1,5 @@
"""Upload subcommand parser."""
from .base import SubcommandParser
@@ -19,7 +20,9 @@ class UploadParser(SubcommandParser):
def add_arguments(self, parser):
"""Add upload-specific arguments."""
parser.add_argument("package_file", help="Path to skill package file (e.g., output/react.zip)")
parser.add_argument(
"package_file", help="Path to skill package file (e.g., output/react.zip)"
)
parser.add_argument(
"--target",
@@ -33,22 +36,34 @@ class UploadParser(SubcommandParser):
# ChromaDB upload options
parser.add_argument(
"--chroma-url",
help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)"
help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)",
)
parser.add_argument(
"--persist-directory",
help="Local directory for persistent ChromaDB storage (default: ./chroma_db)"
help="Local directory for persistent ChromaDB storage (default: ./chroma_db)",
)
# Embedding options
parser.add_argument(
"--embedding-function",
choices=["openai", "sentence-transformers", "none"],
help="Embedding function for ChromaDB/Weaviate (default: platform default)"
help="Embedding function for ChromaDB/Weaviate (default: platform default)",
)
parser.add_argument(
"--openai-api-key", help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)"
)
parser.add_argument("--openai-api-key", help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)")
# Weaviate upload options
parser.add_argument("--weaviate-url", default="http://localhost:8080", help="Weaviate URL (default: http://localhost:8080)")
parser.add_argument("--use-cloud", action="store_true", help="Use Weaviate Cloud (requires --api-key and --cluster-url)")
parser.add_argument("--cluster-url", help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)")
parser.add_argument(
"--weaviate-url",
default="http://localhost:8080",
help="Weaviate URL (default: http://localhost:8080)",
)
parser.add_argument(
"--use-cloud",
action="store_true",
help="Use Weaviate Cloud (requires --api-key and --cluster-url)",
)
parser.add_argument(
"--cluster-url", help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)"
)

View File

@@ -30,14 +30,14 @@ logger = logging.getLogger(__name__)
# Confidence thresholds for pattern filtering (Issue #240)
CONFIDENCE_THRESHOLDS = {
'critical': 0.80, # High-confidence patterns for ARCHITECTURE.md
'high': 0.70, # Include in detailed analysis
'medium': 0.60, # Include with warning/context
'low': 0.50, # Minimum detection threshold
"critical": 0.80, # High-confidence patterns for ARCHITECTURE.md
"high": 0.70, # Include in detailed analysis
"medium": 0.60, # Include with warning/context
"low": 0.50, # Minimum detection threshold
}
# Default minimum confidence for pattern detection
DEFAULT_MIN_CONFIDENCE = CONFIDENCE_THRESHOLDS['low']
DEFAULT_MIN_CONFIDENCE = CONFIDENCE_THRESHOLDS["low"]
@dataclass
@@ -1697,9 +1697,11 @@ def create_multi_level_report(pattern_results: list[dict]) -> dict:
all_patterns_sorted = sorted(all_patterns, key=lambda p: p.get("confidence", 0.0), reverse=True)
# Filter by confidence levels
critical = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS['critical'])
high_confidence = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS['high'])
medium = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS['medium'])
critical = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS["critical"])
high_confidence = filter_patterns_by_confidence(
all_patterns_sorted, CONFIDENCE_THRESHOLDS["high"]
)
medium = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS["medium"])
return {
"all_patterns": all_patterns_sorted,

View File

@@ -3,6 +3,7 @@
Provides predefined analysis configurations with clear trade-offs
between speed and comprehensiveness.
"""
from dataclasses import dataclass
@@ -13,6 +14,7 @@ class AnalysisPreset:
Defines a complete analysis configuration including depth,
feature flags, and AI enhancement level.
"""
name: str
description: str
depth: str # surface, deep, full
@@ -29,54 +31,52 @@ PRESETS = {
description="Fast basic analysis (1-2 min, essential features only)",
depth="surface",
features={
"api_reference": True, # ON - Essential for API docs
"api_reference": True, # ON - Essential for API docs
"dependency_graph": False, # OFF - Slow, not critical for quick
"patterns": False, # OFF - Slow pattern detection
"test_examples": False, # OFF - Time-consuming extraction
"how_to_guides": False, # OFF - Requires AI enhancement
"config_patterns": False, # OFF - Not critical for quick scan
"docs": True, # ON - README/docs are essential
"patterns": False, # OFF - Slow pattern detection
"test_examples": False, # OFF - Time-consuming extraction
"how_to_guides": False, # OFF - Requires AI enhancement
"config_patterns": False, # OFF - Not critical for quick scan
"docs": True, # ON - README/docs are essential
},
enhance_level=0, # No AI enhancement (fast)
estimated_time="1-2 minutes",
icon=""
icon="",
),
"standard": AnalysisPreset(
name="Standard",
description="Balanced analysis (5-10 min, core features, DEFAULT)",
depth="deep",
features={
"api_reference": True, # ON - Core feature
"dependency_graph": True, # ON - Valuable insights
"patterns": True, # ON - Design pattern detection
"test_examples": True, # ON - Real usage examples
"how_to_guides": False, # OFF - Requires AI (slow)
"config_patterns": True, # ON - Configuration docs
"docs": True, # ON - Project documentation
"api_reference": True, # ON - Core feature
"dependency_graph": True, # ON - Valuable insights
"patterns": True, # ON - Design pattern detection
"test_examples": True, # ON - Real usage examples
"how_to_guides": False, # OFF - Requires AI (slow)
"config_patterns": True, # ON - Configuration docs
"docs": True, # ON - Project documentation
},
enhance_level=1, # SKILL.md enhancement only
estimated_time="5-10 minutes",
icon="🎯"
icon="🎯",
),
"comprehensive": AnalysisPreset(
name="Comprehensive",
description="Full analysis (20-60 min, all features + AI)",
depth="full",
features={
"api_reference": True, # ON - Complete API docs
"dependency_graph": True, # ON - Full dependency analysis
"patterns": True, # ON - All design patterns
"test_examples": True, # ON - All test examples
"how_to_guides": True, # ON - AI-generated guides
"config_patterns": True, # ON - All configuration patterns
"docs": True, # ON - All project docs
"api_reference": True, # ON - Complete API docs
"dependency_graph": True, # ON - Full dependency analysis
"patterns": True, # ON - All design patterns
"test_examples": True, # ON - All test examples
"how_to_guides": True, # ON - AI-generated guides
"config_patterns": True, # ON - All configuration patterns
"docs": True, # ON - All project docs
},
enhance_level=3, # Full AI enhancement (all features)
estimated_time="20-60 minutes",
icon="🚀"
)
icon="🚀",
),
}
@@ -142,10 +142,7 @@ class PresetManager:
raise ValueError(f"Unknown preset: {preset_name}")
# Start with preset defaults
updated_args = {
'depth': preset.depth,
'enhance_level': preset.enhance_level
}
updated_args = {"depth": preset.depth, "enhance_level": preset.enhance_level}
# Convert feature flags to skip_* arguments
# feature=False → skip_feature=True (disabled)

View File

@@ -16,6 +16,7 @@ from enum import Enum
class MetricLevel(Enum):
"""Metric severity level."""
INFO = "info"
WARNING = "warning"
ERROR = "error"
@@ -25,6 +26,7 @@ class MetricLevel(Enum):
@dataclass
class QualityMetric:
"""Individual quality metric."""
name: str
value: float # 0.0-1.0 (or 0-100 percentage)
level: MetricLevel
@@ -35,6 +37,7 @@ class QualityMetric:
@dataclass
class QualityScore:
"""Overall quality score."""
total_score: float # 0-100
completeness: float # 0-100
accuracy: float # 0-100
@@ -46,6 +49,7 @@ class QualityScore:
@dataclass
class QualityReport:
"""Complete quality report."""
timestamp: str
skill_name: str
overall_score: QualityScore
@@ -64,10 +68,17 @@ class QualityAnalyzer:
# Thresholds for quality grades
GRADE_THRESHOLDS = {
'A+': 95, 'A': 90, 'A-': 85,
'B+': 80, 'B': 75, 'B-': 70,
'C+': 65, 'C': 60, 'C-': 55,
'D': 50, 'F': 0
"A+": 95,
"A": 90,
"A-": 85,
"B+": 80,
"B": 75,
"B-": 70,
"C+": 65,
"C": 60,
"C-": 55,
"D": 50,
"F": 0,
}
def __init__(self, skill_dir: Path):
@@ -102,7 +113,7 @@ class QualityAnalyzer:
score += 10
# Has sections (10 points)
if content.count('#') >= 5:
if content.count("#") >= 5:
score += 10
# References directory (20 points)
@@ -134,13 +145,15 @@ class QualityAnalyzer:
if len(suggestions) == 0:
suggestions.append("Expand documentation coverage")
self.metrics.append(QualityMetric(
name="Completeness",
value=completeness,
level=level,
description=f"Documentation completeness: {completeness:.1f}%",
suggestions=suggestions
))
self.metrics.append(
QualityMetric(
name="Completeness",
value=completeness,
level=level,
description=f"Documentation completeness: {completeness:.1f}%",
suggestions=suggestions,
)
)
return completeness
@@ -166,14 +179,14 @@ class QualityAnalyzer:
content = skill_md.read_text(encoding="utf-8")
# Check for TODO markers (deduct 5 points each, max 20)
todo_count = content.lower().count('todo')
todo_count = content.lower().count("todo")
if todo_count > 0:
deduction = min(todo_count * 5, 20)
score -= deduction
issues.append(f"Found {todo_count} TODO markers")
# Check for placeholder text (deduct 10)
placeholders = ['lorem ipsum', 'placeholder', 'coming soon']
placeholders = ["lorem ipsum", "placeholder", "coming soon"]
for placeholder in placeholders:
if placeholder in content.lower():
score -= 10
@@ -195,13 +208,15 @@ class QualityAnalyzer:
if accuracy < 100 and issues:
suggestions.extend(issues[:3]) # Top 3 issues
self.metrics.append(QualityMetric(
name="Accuracy",
value=accuracy,
level=level,
description=f"Documentation accuracy: {accuracy:.1f}%",
suggestions=suggestions
))
self.metrics.append(
QualityMetric(
name="Accuracy",
value=accuracy,
level=level,
description=f"Documentation accuracy: {accuracy:.1f}%",
suggestions=suggestions,
)
)
return accuracy
@@ -234,13 +249,13 @@ class QualityAnalyzer:
# Check for specific types (20 points each)
ref_names = [f.stem.lower() for f in ref_files]
if any('getting' in name or 'start' in name for name in ref_names):
if any("getting" in name or "start" in name for name in ref_names):
score += 20
if any('api' in name or 'reference' in name for name in ref_names):
if any("api" in name or "reference" in name for name in ref_names):
score += 20
if any('example' in name or 'tutorial' in name for name in ref_names):
if any("example" in name or "tutorial" in name for name in ref_names):
score += 20
# Has diverse content (10 points)
@@ -258,13 +273,15 @@ class QualityAnalyzer:
suggestions.append("Add API reference documentation")
suggestions.append("Expand documentation coverage")
self.metrics.append(QualityMetric(
name="Coverage",
value=coverage,
level=level,
description=f"Documentation coverage: {coverage:.1f}%",
suggestions=suggestions
))
self.metrics.append(
QualityMetric(
name="Coverage",
value=coverage,
level=level,
description=f"Documentation coverage: {coverage:.1f}%",
suggestions=suggestions,
)
)
return coverage
@@ -308,56 +325,54 @@ class QualityAnalyzer:
if health < 100:
suggestions.extend(issues[:3])
self.metrics.append(QualityMetric(
name="Health",
value=health,
level=level,
description=f"Skill health: {health:.1f}%",
suggestions=suggestions
))
self.metrics.append(
QualityMetric(
name="Health",
value=health,
level=level,
description=f"Skill health: {health:.1f}%",
suggestions=suggestions,
)
)
return health
def calculate_statistics(self) -> dict[str, Any]:
"""Calculate skill statistics."""
stats = {
'total_files': 0,
'total_size_bytes': 0,
'markdown_files': 0,
'reference_files': 0,
'total_characters': 0,
'total_words': 0
"total_files": 0,
"total_size_bytes": 0,
"markdown_files": 0,
"reference_files": 0,
"total_characters": 0,
"total_words": 0,
}
# Count files and sizes
for md_file in self.skill_dir.rglob("*.md"):
stats['total_files'] += 1
stats['markdown_files'] += 1
stats["total_files"] += 1
stats["markdown_files"] += 1
size = md_file.stat().st_size
stats['total_size_bytes'] += size
stats["total_size_bytes"] += size
# Count words
try:
content = md_file.read_text(encoding="utf-8")
stats['total_characters'] += len(content)
stats['total_words'] += len(content.split())
stats["total_characters"] += len(content)
stats["total_words"] += len(content.split())
except Exception:
pass
# Count references
refs_dir = self.skill_dir / "references"
if refs_dir.exists():
stats['reference_files'] = len(list(refs_dir.glob("*.md")))
stats["reference_files"] = len(list(refs_dir.glob("*.md")))
self.statistics = stats
return stats
def calculate_overall_score(
self,
completeness: float,
accuracy: float,
coverage: float,
health: float
self, completeness: float, accuracy: float, coverage: float, health: float
) -> QualityScore:
"""
Calculate overall quality score.
@@ -368,15 +383,10 @@ class QualityAnalyzer:
- Coverage: 25%
- Health: 20%
"""
total = (
completeness * 0.30 +
accuracy * 0.25 +
coverage * 0.25 +
health * 0.20
)
total = completeness * 0.30 + accuracy * 0.25 + coverage * 0.25 + health * 0.20
# Determine grade
grade = 'F'
grade = "F"
for g, threshold in self.GRADE_THRESHOLDS.items():
if total >= threshold:
grade = g
@@ -388,7 +398,7 @@ class QualityAnalyzer:
accuracy=accuracy,
coverage=coverage,
health=health,
grade=grade
grade=grade,
)
def generate_recommendations(self, score: QualityScore) -> list[str]:
@@ -431,9 +441,7 @@ class QualityAnalyzer:
health = self.analyze_health()
# Calculate overall score
overall_score = self.calculate_overall_score(
completeness, accuracy, coverage, health
)
overall_score = self.calculate_overall_score(completeness, accuracy, coverage, health)
# Calculate statistics
stats = self.calculate_statistics()
@@ -447,7 +455,7 @@ class QualityAnalyzer:
overall_score=overall_score,
metrics=self.metrics,
statistics=stats,
recommendations=recommendations
recommendations=recommendations,
)
def format_report(self, report: QualityReport) -> str:
@@ -484,7 +492,7 @@ class QualityAnalyzer:
MetricLevel.INFO: "",
MetricLevel.WARNING: "⚠️",
MetricLevel.ERROR: "",
MetricLevel.CRITICAL: "🔴"
MetricLevel.CRITICAL: "🔴",
}.get(metric.level, "")
lines.append(f" {icon} {metric.name}: {metric.value:.1f}%")
@@ -553,4 +561,5 @@ def main():
if __name__ == "__main__":
import sys
sys.exit(main())

View File

@@ -75,10 +75,7 @@ class RAGChunker:
return len(text) // self.chars_per_token
def chunk_document(
self,
text: str,
metadata: dict,
source_file: str | None = None
self, text: str, metadata: dict, source_file: str | None = None
) -> list[dict]:
"""
Chunk single document into RAG-ready chunks.
@@ -125,11 +122,13 @@ class RAGChunker:
if source_file:
chunk_metadata["source_file"] = source_file
result.append({
"chunk_id": f"{metadata.get('source', 'unknown')}_{i}",
"page_content": chunk_text.strip(),
"metadata": chunk_metadata
})
result.append(
{
"chunk_id": f"{metadata.get('source', 'unknown')}_{i}",
"page_content": chunk_text.strip(),
"metadata": chunk_metadata,
}
)
logger.info(
f"Created {len(result)} chunks from {source_file or 'document'} "
@@ -153,14 +152,10 @@ class RAGChunker:
# Chunk main SKILL.md
skill_md = skill_dir / "SKILL.md"
if skill_md.exists():
with open(skill_md, encoding='utf-8') as f:
with open(skill_md, encoding="utf-8") as f:
content = f.read()
metadata = {
"source": skill_dir.name,
"category": "overview",
"file_type": "skill_md"
}
metadata = {"source": skill_dir.name, "category": "overview", "file_type": "skill_md"}
chunks = self.chunk_document(content, metadata, source_file="SKILL.md")
all_chunks.extend(chunks)
@@ -169,26 +164,21 @@ class RAGChunker:
references_dir = skill_dir / "references"
if references_dir.exists():
for ref_file in references_dir.glob("*.md"):
with open(ref_file, encoding='utf-8') as f:
with open(ref_file, encoding="utf-8") as f:
content = f.read()
metadata = {
"source": skill_dir.name,
"category": ref_file.stem,
"file_type": "reference"
"file_type": "reference",
}
chunks = self.chunk_document(
content,
metadata,
source_file=str(ref_file.relative_to(skill_dir))
content, metadata, source_file=str(ref_file.relative_to(skill_dir))
)
all_chunks.extend(chunks)
logger.info(
f"Chunked skill directory {skill_dir.name}: "
f"{len(all_chunks)} total chunks"
)
logger.info(f"Chunked skill directory {skill_dir.name}: {len(all_chunks)} total chunks")
return all_chunks
@@ -207,32 +197,25 @@ class RAGChunker:
# Match code blocks (``` fenced blocks)
# Use DOTALL flag to match across newlines
code_block_pattern = r'```[^\n]*\n.*?```'
code_block_pattern = r"```[^\n]*\n.*?```"
def replacer(match):
idx = len(code_blocks)
code_blocks.append({
"index": idx,
"content": match.group(0),
"start": match.start(),
"end": match.end()
})
code_blocks.append(
{
"index": idx,
"content": match.group(0),
"start": match.start(),
"end": match.end(),
}
)
return placeholder_pattern.format(idx=idx)
text_with_placeholders = re.sub(
code_block_pattern,
replacer,
text,
flags=re.DOTALL
)
text_with_placeholders = re.sub(code_block_pattern, replacer, text, flags=re.DOTALL)
return text_with_placeholders, code_blocks
def _reinsert_code_blocks(
self,
chunks: list[str],
code_blocks: list[dict]
) -> list[str]:
def _reinsert_code_blocks(self, chunks: list[str], code_blocks: list[dict]) -> list[str]:
"""
Re-insert code blocks into chunks.
@@ -249,7 +232,7 @@ class RAGChunker:
for block in code_blocks:
placeholder = f"<<CODE_BLOCK_{block['index']}>>"
if placeholder in chunk:
chunk = chunk.replace(placeholder, block['content'])
chunk = chunk.replace(placeholder, block["content"])
result.append(chunk)
return result
@@ -268,15 +251,15 @@ class RAGChunker:
# Paragraph boundaries (double newline)
if self.preserve_paragraphs:
for match in re.finditer(r'\n\n+', text):
for match in re.finditer(r"\n\n+", text):
boundaries.append(match.end())
# Section headers (# Header)
for match in re.finditer(r'\n#{1,6}\s+.+\n', text):
for match in re.finditer(r"\n#{1,6}\s+.+\n", text):
boundaries.append(match.start())
# Single newlines (less preferred, but useful)
for match in re.finditer(r'\n', text):
for match in re.finditer(r"\n", text):
boundaries.append(match.start())
# Add artificial boundaries for large documents
@@ -352,7 +335,9 @@ class RAGChunker:
# Add chunk if it meets minimum size requirement
# (unless the entire text is smaller than target size)
if chunk_text.strip() and (len(text) <= target_size_chars or len(chunk_text) >= min_size_chars):
if chunk_text.strip() and (
len(text) <= target_size_chars or len(chunk_text) >= min_size_chars
):
chunks.append(chunk_text)
# Move to next chunk with overlap
@@ -383,7 +368,7 @@ class RAGChunker:
"""
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
with open(output_path, "w", encoding="utf-8") as f:
json.dump(chunks, f, indent=2, ensure_ascii=False)
logger.info(f"Saved {len(chunks)} chunks to {output_path}")
@@ -393,7 +378,9 @@ def main():
"""CLI entry point for testing RAG chunker."""
import argparse
parser = argparse.ArgumentParser(description="RAG Chunker - Semantic chunking for RAG pipelines")
parser = argparse.ArgumentParser(
description="RAG Chunker - Semantic chunking for RAG pipelines"
)
parser.add_argument("skill_dir", type=Path, help="Path to skill directory")
parser.add_argument("--output", "-o", type=Path, help="Output JSON file")
parser.add_argument("--chunk-size", type=int, default=512, help="Target chunk size in tokens")

View File

@@ -59,27 +59,26 @@ def get_storage_adaptor(provider: str, **kwargs) -> BaseStorageAdaptor:
account_name='myaccount')
"""
adaptors = {
's3': S3StorageAdaptor,
'gcs': GCSStorageAdaptor,
'azure': AzureStorageAdaptor,
"s3": S3StorageAdaptor,
"gcs": GCSStorageAdaptor,
"azure": AzureStorageAdaptor,
}
provider_lower = provider.lower()
if provider_lower not in adaptors:
supported = ', '.join(adaptors.keys())
supported = ", ".join(adaptors.keys())
raise ValueError(
f"Unsupported storage provider: {provider}. "
f"Supported providers: {supported}"
f"Unsupported storage provider: {provider}. Supported providers: {supported}"
)
return adaptors[provider_lower](**kwargs)
__all__ = [
'BaseStorageAdaptor',
'StorageObject',
'S3StorageAdaptor',
'GCSStorageAdaptor',
'AzureStorageAdaptor',
'get_storage_adaptor',
"BaseStorageAdaptor",
"StorageObject",
"S3StorageAdaptor",
"GCSStorageAdaptor",
"AzureStorageAdaptor",
"get_storage_adaptor",
]

View File

@@ -9,6 +9,7 @@ from datetime import datetime, timedelta
try:
from azure.storage.blob import BlobServiceClient, BlobSasPermissions, generate_blob_sas
from azure.core.exceptions import ResourceNotFoundError
AZURE_AVAILABLE = True
except ImportError:
AZURE_AVAILABLE = False
@@ -65,38 +66,30 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
"Install with: pip install azure-storage-blob"
)
if 'container' not in kwargs:
if "container" not in kwargs:
raise ValueError("container parameter is required for Azure storage")
self.container_name = kwargs['container']
self.container_name = kwargs["container"]
# Initialize BlobServiceClient
if 'connection_string' in kwargs:
connection_string = kwargs['connection_string']
if "connection_string" in kwargs:
connection_string = kwargs["connection_string"]
else:
connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
if connection_string:
self.blob_service_client = BlobServiceClient.from_connection_string(
connection_string
)
self.blob_service_client = BlobServiceClient.from_connection_string(connection_string)
# Extract account name from connection string
self.account_name = None
self.account_key = None
for part in connection_string.split(';'):
if part.startswith('AccountName='):
self.account_name = part.split('=', 1)[1]
elif part.startswith('AccountKey='):
self.account_key = part.split('=', 1)[1]
for part in connection_string.split(";"):
if part.startswith("AccountName="):
self.account_name = part.split("=", 1)[1]
elif part.startswith("AccountKey="):
self.account_key = part.split("=", 1)[1]
else:
account_name = kwargs.get(
'account_name',
os.getenv('AZURE_STORAGE_ACCOUNT_NAME')
)
account_key = kwargs.get(
'account_key',
os.getenv('AZURE_STORAGE_ACCOUNT_KEY')
)
account_name = kwargs.get("account_name", os.getenv("AZURE_STORAGE_ACCOUNT_NAME"))
account_key = kwargs.get("account_key", os.getenv("AZURE_STORAGE_ACCOUNT_KEY"))
if not account_name or not account_key:
raise ValueError(
@@ -108,13 +101,10 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
self.account_key = account_key
account_url = f"https://{account_name}.blob.core.windows.net"
self.blob_service_client = BlobServiceClient(
account_url=account_url,
credential=account_key
account_url=account_url, credential=account_key
)
self.container_client = self.blob_service_client.get_container_client(
self.container_name
)
self.container_client = self.blob_service_client.get_container_client(self.container_name)
def upload_file(
self, local_path: str, remote_path: str, metadata: dict[str, str] | None = None
@@ -128,11 +118,7 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
blob_client = self.container_client.get_blob_client(remote_path)
with open(local_file, "rb") as data:
blob_client.upload_blob(
data,
overwrite=True,
metadata=metadata
)
blob_client.upload_blob(data, overwrite=True, metadata=metadata)
return f"https://{self.account_name}.blob.core.windows.net/{self.container_name}/{remote_path}"
except Exception as e:
@@ -164,25 +150,26 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
except Exception as e:
raise Exception(f"Azure deletion failed: {e}") from e
def list_files(
self, prefix: str = "", max_results: int = 1000
) -> list[StorageObject]:
def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]:
"""List files in Azure container."""
try:
blobs = self.container_client.list_blobs(
name_starts_with=prefix,
results_per_page=max_results
name_starts_with=prefix, results_per_page=max_results
)
files = []
for blob in blobs:
files.append(StorageObject(
key=blob.name,
size=blob.size,
last_modified=blob.last_modified.isoformat() if blob.last_modified else None,
etag=blob.etag,
metadata=blob.metadata
))
files.append(
StorageObject(
key=blob.name,
size=blob.size,
last_modified=blob.last_modified.isoformat()
if blob.last_modified
else None,
etag=blob.etag,
metadata=blob.metadata,
)
)
return files
except Exception as e:
@@ -205,9 +192,7 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
raise FileNotFoundError(f"Remote file not found: {remote_path}")
if not self.account_name or not self.account_key:
raise ValueError(
"Account name and key are required for SAS URL generation"
)
raise ValueError("Account name and key are required for SAS URL generation")
sas_token = generate_blob_sas(
account_name=self.account_name,
@@ -215,7 +200,7 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
blob_name=remote_path,
account_key=self.account_key,
permission=BlobSasPermissions(read=True),
expiry=datetime.utcnow() + timedelta(seconds=expires_in)
expiry=datetime.utcnow() + timedelta(seconds=expires_in),
)
return f"{blob_client.url}?{sas_token}"
@@ -239,12 +224,13 @@ class AzureStorageAdaptor(BaseStorageAdaptor):
# Wait for copy to complete
properties = dest_blob.get_blob_properties()
while properties.copy.status == 'pending':
while properties.copy.status == "pending":
import time
time.sleep(0.1)
properties = dest_blob.get_blob_properties()
if properties.copy.status != 'success':
if properties.copy.status != "success":
raise Exception(f"Copy failed with status: {properties.copy.status}")
except FileNotFoundError:

View File

@@ -95,9 +95,7 @@ class BaseStorageAdaptor(ABC):
pass
@abstractmethod
def list_files(
self, prefix: str = "", max_results: int = 1000
) -> list[StorageObject]:
def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]:
"""
List files in cloud storage.
@@ -191,9 +189,7 @@ class BaseStorageAdaptor(ABC):
return uploaded_files
def download_directory(
self, remote_prefix: str, local_dir: str
) -> list[str]:
def download_directory(self, remote_prefix: str, local_dir: str) -> list[str]:
"""
Download directory from cloud storage.
@@ -245,9 +241,7 @@ class BaseStorageAdaptor(ABC):
raise FileNotFoundError(f"File not found: {remote_path}")
return files[0].size
def copy_file(
self, source_path: str, dest_path: str
) -> None:
def copy_file(self, source_path: str, dest_path: str) -> None:
"""
Copy file within cloud storage.

View File

@@ -9,6 +9,7 @@ from datetime import timedelta
try:
from google.cloud import storage
from google.cloud.exceptions import NotFound
GCS_AVAILABLE = True
except ImportError:
GCS_AVAILABLE = False
@@ -63,19 +64,19 @@ class GCSStorageAdaptor(BaseStorageAdaptor):
"Install with: pip install google-cloud-storage"
)
if 'bucket' not in kwargs:
if "bucket" not in kwargs:
raise ValueError("bucket parameter is required for GCS storage")
self.bucket_name = kwargs['bucket']
self.project = kwargs.get('project', os.getenv('GOOGLE_CLOUD_PROJECT'))
self.bucket_name = kwargs["bucket"]
self.project = kwargs.get("project", os.getenv("GOOGLE_CLOUD_PROJECT"))
# Initialize GCS client
client_kwargs = {}
if self.project:
client_kwargs['project'] = self.project
client_kwargs["project"] = self.project
if 'credentials_path' in kwargs:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = kwargs['credentials_path']
if "credentials_path" in kwargs:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = kwargs["credentials_path"]
self.storage_client = storage.Client(**client_kwargs)
self.bucket = self.storage_client.bucket(self.bucket_name)
@@ -122,26 +123,24 @@ class GCSStorageAdaptor(BaseStorageAdaptor):
except Exception as e:
raise Exception(f"GCS deletion failed: {e}") from e
def list_files(
self, prefix: str = "", max_results: int = 1000
) -> list[StorageObject]:
def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]:
"""List files in GCS bucket."""
try:
blobs = self.storage_client.list_blobs(
self.bucket_name,
prefix=prefix,
max_results=max_results
self.bucket_name, prefix=prefix, max_results=max_results
)
files = []
for blob in blobs:
files.append(StorageObject(
key=blob.name,
size=blob.size,
last_modified=blob.updated.isoformat() if blob.updated else None,
etag=blob.etag,
metadata=blob.metadata
))
files.append(
StorageObject(
key=blob.name,
size=blob.size,
last_modified=blob.updated.isoformat() if blob.updated else None,
etag=blob.etag,
metadata=blob.metadata,
)
)
return files
except Exception as e:
@@ -164,9 +163,7 @@ class GCSStorageAdaptor(BaseStorageAdaptor):
raise FileNotFoundError(f"Remote file not found: {remote_path}")
url = blob.generate_signed_url(
version="v4",
expiration=timedelta(seconds=expires_in),
method="GET"
version="v4", expiration=timedelta(seconds=expires_in), method="GET"
)
return url
except FileNotFoundError:
@@ -182,11 +179,7 @@ class GCSStorageAdaptor(BaseStorageAdaptor):
if not source_blob.exists():
raise FileNotFoundError(f"Source file not found: {source_path}")
self.bucket.copy_blob(
source_blob,
self.bucket,
dest_path
)
self.bucket.copy_blob(source_blob, self.bucket, dest_path)
except FileNotFoundError:
raise
except Exception as e:

View File

@@ -8,6 +8,7 @@ from pathlib import Path
try:
import boto3
from botocore.exceptions import ClientError
BOTO3_AVAILABLE = True
except ImportError:
BOTO3_AVAILABLE = False
@@ -63,33 +64,30 @@ class S3StorageAdaptor(BaseStorageAdaptor):
super().__init__(**kwargs)
if not BOTO3_AVAILABLE:
raise ImportError(
"boto3 is required for S3 storage. "
"Install with: pip install boto3"
)
raise ImportError("boto3 is required for S3 storage. Install with: pip install boto3")
if 'bucket' not in kwargs:
if "bucket" not in kwargs:
raise ValueError("bucket parameter is required for S3 storage")
self.bucket = kwargs['bucket']
self.region = kwargs.get('region', os.getenv('AWS_DEFAULT_REGION', 'us-east-1'))
self.bucket = kwargs["bucket"]
self.region = kwargs.get("region", os.getenv("AWS_DEFAULT_REGION", "us-east-1"))
# Initialize S3 client
client_kwargs = {
'region_name': self.region,
"region_name": self.region,
}
if 'endpoint_url' in kwargs:
client_kwargs['endpoint_url'] = kwargs['endpoint_url']
if "endpoint_url" in kwargs:
client_kwargs["endpoint_url"] = kwargs["endpoint_url"]
if 'aws_access_key_id' in kwargs:
client_kwargs['aws_access_key_id'] = kwargs['aws_access_key_id']
if "aws_access_key_id" in kwargs:
client_kwargs["aws_access_key_id"] = kwargs["aws_access_key_id"]
if 'aws_secret_access_key' in kwargs:
client_kwargs['aws_secret_access_key'] = kwargs['aws_secret_access_key']
if "aws_secret_access_key" in kwargs:
client_kwargs["aws_secret_access_key"] = kwargs["aws_secret_access_key"]
self.s3_client = boto3.client('s3', **client_kwargs)
self.s3_resource = boto3.resource('s3', **client_kwargs)
self.s3_client = boto3.client("s3", **client_kwargs)
self.s3_resource = boto3.resource("s3", **client_kwargs)
def upload_file(
self, local_path: str, remote_path: str, metadata: dict[str, str] | None = None
@@ -101,14 +99,14 @@ class S3StorageAdaptor(BaseStorageAdaptor):
extra_args = {}
if metadata:
extra_args['Metadata'] = metadata
extra_args["Metadata"] = metadata
try:
self.s3_client.upload_file(
str(local_file),
self.bucket,
remote_path,
ExtraArgs=extra_args if extra_args else None
ExtraArgs=extra_args if extra_args else None,
)
return f"s3://{self.bucket}/{remote_path}"
except ClientError as e:
@@ -120,50 +118,41 @@ class S3StorageAdaptor(BaseStorageAdaptor):
local_file.parent.mkdir(parents=True, exist_ok=True)
try:
self.s3_client.download_file(
self.bucket,
remote_path,
str(local_file)
)
self.s3_client.download_file(self.bucket, remote_path, str(local_file))
except ClientError as e:
if e.response['Error']['Code'] == '404':
if e.response["Error"]["Code"] == "404":
raise FileNotFoundError(f"Remote file not found: {remote_path}") from e
raise Exception(f"S3 download failed: {e}") from e
def delete_file(self, remote_path: str) -> None:
"""Delete file from S3."""
try:
self.s3_client.delete_object(
Bucket=self.bucket,
Key=remote_path
)
self.s3_client.delete_object(Bucket=self.bucket, Key=remote_path)
except ClientError as e:
raise Exception(f"S3 deletion failed: {e}") from e
def list_files(
self, prefix: str = "", max_results: int = 1000
) -> list[StorageObject]:
def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]:
"""List files in S3 bucket."""
try:
paginator = self.s3_client.get_paginator('list_objects_v2')
paginator = self.s3_client.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(
Bucket=self.bucket,
Prefix=prefix,
PaginationConfig={'MaxItems': max_results}
Bucket=self.bucket, Prefix=prefix, PaginationConfig={"MaxItems": max_results}
)
files = []
for page in page_iterator:
if 'Contents' not in page:
if "Contents" not in page:
continue
for obj in page['Contents']:
files.append(StorageObject(
key=obj['Key'],
size=obj['Size'],
last_modified=obj['LastModified'].isoformat(),
etag=obj.get('ETag', '').strip('"')
))
for obj in page["Contents"]:
files.append(
StorageObject(
key=obj["Key"],
size=obj["Size"],
last_modified=obj["LastModified"].isoformat(),
etag=obj.get("ETag", "").strip('"'),
)
)
return files
except ClientError as e:
@@ -172,13 +161,10 @@ class S3StorageAdaptor(BaseStorageAdaptor):
def file_exists(self, remote_path: str) -> bool:
"""Check if file exists in S3."""
try:
self.s3_client.head_object(
Bucket=self.bucket,
Key=remote_path
)
self.s3_client.head_object(Bucket=self.bucket, Key=remote_path)
return True
except ClientError as e:
if e.response['Error']['Code'] == '404':
if e.response["Error"]["Code"] == "404":
return False
raise Exception(f"S3 head_object failed: {e}") from e
@@ -186,12 +172,9 @@ class S3StorageAdaptor(BaseStorageAdaptor):
"""Generate presigned URL for S3 object."""
try:
url = self.s3_client.generate_presigned_url(
'get_object',
Params={
'Bucket': self.bucket,
'Key': remote_path
},
ExpiresIn=expires_in
"get_object",
Params={"Bucket": self.bucket, "Key": remote_path},
ExpiresIn=expires_in,
)
return url
except ClientError as e:
@@ -200,16 +183,9 @@ class S3StorageAdaptor(BaseStorageAdaptor):
def copy_file(self, source_path: str, dest_path: str) -> None:
"""Copy file within S3 bucket (server-side copy)."""
try:
copy_source = {
'Bucket': self.bucket,
'Key': source_path
}
self.s3_client.copy_object(
CopySource=copy_source,
Bucket=self.bucket,
Key=dest_path
)
copy_source = {"Bucket": self.bucket, "Key": source_path}
self.s3_client.copy_object(CopySource=copy_source, Bucket=self.bucket, Key=dest_path)
except ClientError as e:
if e.response['Error']['Code'] == '404':
if e.response["Error"]["Code"] == "404":
raise FileNotFoundError(f"Source file not found: {source_path}") from e
raise Exception(f"S3 copy failed: {e}") from e

View File

@@ -17,6 +17,7 @@ import time
@dataclass
class ChunkMetadata:
"""Metadata for a document chunk."""
chunk_id: str
source: str
category: str
@@ -30,6 +31,7 @@ class ChunkMetadata:
@dataclass
class IngestionProgress:
"""Progress tracking for streaming ingestion."""
total_documents: int
processed_documents: int
total_chunks: int
@@ -81,7 +83,7 @@ class StreamingIngester:
chunk_size: int = 4000,
chunk_overlap: int = 200,
batch_size: int = 100,
max_memory_mb: int = 500
max_memory_mb: int = 500,
):
"""
Initialize streaming ingester.
@@ -103,7 +105,7 @@ class StreamingIngester:
content: str,
metadata: dict,
chunk_size: int | None = None,
chunk_overlap: int | None = None
chunk_overlap: int | None = None,
) -> Iterator[tuple[str, ChunkMetadata]]:
"""
Split document into overlapping chunks.
@@ -130,7 +132,7 @@ class StreamingIngester:
chunk_index=0,
total_chunks=1,
char_start=0,
char_end=len(content)
char_end=len(content),
)
yield content, chunk_meta
return
@@ -162,7 +164,7 @@ class StreamingIngester:
chunk_index=i,
total_chunks=total_chunks,
char_start=start,
char_end=end
char_end=end,
)
yield chunk_text, chunk_meta
@@ -170,17 +172,12 @@ class StreamingIngester:
def _generate_chunk_id(self, content: str, metadata: dict, chunk_index: int) -> str:
"""Generate deterministic chunk ID."""
id_string = (
f"{metadata.get('source', '')}-"
f"{metadata.get('file', '')}-"
f"{chunk_index}-"
f"{content[:50]}"
f"{metadata.get('source', '')}-{metadata.get('file', '')}-{chunk_index}-{content[:50]}"
)
return hashlib.md5(id_string.encode()).hexdigest()
def stream_skill_directory(
self,
skill_dir: Path,
callback: callable | None = None
self, skill_dir: Path, callback: callable | None = None
) -> Iterator[tuple[str, dict]]:
"""
Stream all documents from skill directory.
@@ -218,7 +215,7 @@ class StreamingIngester:
processed_chunks=0,
failed_chunks=0,
bytes_processed=0,
start_time=time.time()
start_time=time.time(),
)
# Process each document
@@ -235,11 +232,13 @@ class StreamingIngester:
"category": category,
"file": filename,
"type": "documentation" if filename == "SKILL.md" else "reference",
"version": "1.0.0"
"version": "1.0.0",
}
# Chunk document and yield chunks
for chunk_count, (chunk_text, chunk_meta) in enumerate(self.chunk_document(content, metadata), start=1):
for chunk_count, (chunk_text, chunk_meta) in enumerate(
self.chunk_document(content, metadata), start=1
):
self.progress.total_chunks += 1
# Convert chunk metadata to dict
@@ -272,9 +271,7 @@ class StreamingIngester:
continue
def batch_iterator(
self,
chunks: Iterator[tuple[str, dict]],
batch_size: int | None = None
self, chunks: Iterator[tuple[str, dict]], batch_size: int | None = None
) -> Iterator[list[tuple[str, dict]]]:
"""
Group chunks into batches for efficient processing.
@@ -321,7 +318,7 @@ class StreamingIngester:
"failed_chunks": self.progress.failed_chunks,
"bytes_processed": self.progress.bytes_processed,
},
"state": state
"state": state,
}
checkpoint_path.write_text(json.dumps(checkpoint_data, indent=2))
@@ -384,23 +381,25 @@ def main():
parser = argparse.ArgumentParser(description="Stream and chunk skill documents")
parser.add_argument("input", help="Input file or directory path")
parser.add_argument("--chunk-size", type=int, default=4000, help="Chunk size in characters")
parser.add_argument("--chunk-overlap", type=int, default=200, help="Chunk overlap in characters")
parser.add_argument(
"--chunk-overlap", type=int, default=200, help="Chunk overlap in characters"
)
parser.add_argument("--batch-size", type=int, default=100, help="Batch size for processing")
parser.add_argument("--checkpoint", help="Checkpoint file path")
args = parser.parse_args()
# Initialize ingester
ingester = StreamingIngester(
chunk_size=args.chunk_size,
chunk_overlap=args.chunk_overlap,
batch_size=args.batch_size
chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap, batch_size=args.batch_size
)
# Progress callback
def on_progress(progress: IngestionProgress):
if progress.processed_chunks % 10 == 0:
print(f"Progress: {progress.progress_percent:.1f}% - "
f"{progress.processed_chunks}/{progress.total_chunks} chunks")
print(
f"Progress: {progress.progress_percent:.1f}% - "
f"{progress.processed_chunks}/{progress.total_chunks} chunks"
)
# Stream input
input_path = Path(args.input)
@@ -416,17 +415,23 @@ def main():
metadata = {"source": input_path.stem, "file": input_path.name}
file_chunks = ingester.chunk_document(content, metadata)
# Convert to generator format matching stream_skill_directory
chunks = ((text, {
"content": text,
"chunk_id": meta.chunk_id,
"source": meta.source,
"category": meta.category,
"file": meta.file,
"chunk_index": meta.chunk_index,
"total_chunks": meta.total_chunks,
"char_start": meta.char_start,
"char_end": meta.char_end,
}) for text, meta in file_chunks)
chunks = (
(
text,
{
"content": text,
"chunk_id": meta.chunk_id,
"source": meta.source,
"category": meta.category,
"file": meta.file,
"chunk_index": meta.chunk_index,
"total_chunks": meta.total_chunks,
"char_start": meta.char_start,
"char_end": meta.char_end,
},
)
for text, meta in file_chunks
)
# Process in batches
all_chunks = []
@@ -437,8 +442,7 @@ def main():
# Save checkpoint if specified
if args.checkpoint:
ingester.save_checkpoint(
Path(args.checkpoint),
{"processed_batches": len(all_chunks) // args.batch_size}
Path(args.checkpoint), {"processed_batches": len(all_chunks) // args.batch_size}
)
# Final progress
@@ -449,4 +453,5 @@ def main():
if __name__ == "__main__":
import sys
sys.exit(main())

View File

@@ -22,9 +22,7 @@ def handle_signal(_signum, _frame):
def start_command(args):
"""Start monitoring."""
monitor = SyncMonitor(
config_path=args.config,
check_interval=args.interval,
auto_update=args.auto_update
config_path=args.config, check_interval=args.interval, auto_update=args.auto_update
)
# Register signal handlers
@@ -42,6 +40,7 @@ def start_command(args):
# Keep running
while True:
import time
time.sleep(1)
except KeyboardInterrupt:
@@ -53,7 +52,7 @@ def check_command(args):
"""Check for changes once."""
monitor = SyncMonitor(
config_path=args.config,
check_interval=3600 # Not used for single check
check_interval=3600, # Not used for single check
)
print(f"🔍 Checking {args.config} for changes...")
@@ -82,7 +81,7 @@ def check_command(args):
print(f"{change.url}")
if change.diff and args.diff:
print(f" Diff preview (first 5 lines):")
for line in change.diff.split('\n')[:5]:
for line in change.diff.split("\n")[:5]:
print(f" {line}")
if report.deleted:
@@ -95,10 +94,7 @@ def check_command(args):
def stats_command(args):
"""Show monitoring statistics."""
monitor = SyncMonitor(
config_path=args.config,
check_interval=3600
)
monitor = SyncMonitor(config_path=args.config, check_interval=3600)
stats = monitor.stats()
@@ -117,7 +113,7 @@ def reset_command(args):
state_file = Path(f"{args.skill_name}_sync.json")
if state_file.exists():
if args.force or input(f"⚠️ Reset state for {args.skill_name}? [y/N]: ").lower() == 'y':
if args.force or input(f"⚠️ Reset state for {args.skill_name}? [y/N]: ").lower() == "y":
state_file.unlink()
print(f"✅ State reset for {args.skill_name}")
else:
@@ -129,7 +125,7 @@ def reset_command(args):
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description='Monitor documentation for changes and update skills',
description="Monitor documentation for changes and update skills",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -153,52 +149,39 @@ Examples:
# Reset state
skill-seekers-sync reset --skill-name react
"""
""",
)
subparsers = parser.add_subparsers(dest='command', help='Command to execute')
subparsers = parser.add_subparsers(dest="command", help="Command to execute")
# Start command
start_parser = subparsers.add_parser('start', help='Start continuous monitoring')
start_parser.add_argument('--config', required=True, help='Path to skill config file')
start_parser = subparsers.add_parser("start", help="Start continuous monitoring")
start_parser.add_argument("--config", required=True, help="Path to skill config file")
start_parser.add_argument(
'--interval', '-i',
"--interval",
"-i",
type=int,
default=3600,
help='Check interval in seconds (default: 3600 = 1 hour)'
help="Check interval in seconds (default: 3600 = 1 hour)",
)
start_parser.add_argument(
'--auto-update',
action='store_true',
help='Automatically rebuild skill on changes'
"--auto-update", action="store_true", help="Automatically rebuild skill on changes"
)
# Check command
check_parser = subparsers.add_parser('check', help='Check for changes once')
check_parser.add_argument('--config', required=True, help='Path to skill config file')
check_parser.add_argument(
'--diff', '-d',
action='store_true',
help='Generate content diffs'
)
check_parser.add_argument(
'--verbose', '-v',
action='store_true',
help='Show detailed output'
)
check_parser = subparsers.add_parser("check", help="Check for changes once")
check_parser.add_argument("--config", required=True, help="Path to skill config file")
check_parser.add_argument("--diff", "-d", action="store_true", help="Generate content diffs")
check_parser.add_argument("--verbose", "-v", action="store_true", help="Show detailed output")
# Stats command
stats_parser = subparsers.add_parser('stats', help='Show monitoring statistics')
stats_parser.add_argument('--config', required=True, help='Path to skill config file')
stats_parser = subparsers.add_parser("stats", help="Show monitoring statistics")
stats_parser.add_argument("--config", required=True, help="Path to skill config file")
# Reset command
reset_parser = subparsers.add_parser('reset', help='Reset monitoring state')
reset_parser.add_argument('--skill-name', required=True, help='Skill name')
reset_parser.add_argument(
'--force', '-f',
action='store_true',
help='Skip confirmation'
)
reset_parser = subparsers.add_parser("reset", help="Reset monitoring state")
reset_parser.add_argument("--skill-name", required=True, help="Skill name")
reset_parser.add_argument("--force", "-f", action="store_true", help="Skip confirmation")
args = parser.parse_args()
@@ -207,18 +190,18 @@ Examples:
sys.exit(1)
try:
if args.command == 'start':
if args.command == "start":
start_command(args)
elif args.command == 'check':
elif args.command == "check":
check_command(args)
elif args.command == 'stats':
elif args.command == "stats":
stats_command(args)
elif args.command == 'reset':
elif args.command == "reset":
reset_command(args)
except Exception as e:
print(f"\n❌ Error: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -59,7 +59,7 @@ def upload_skill_api(package_path, target="claude", api_key=None, **kwargs):
api_key = os.environ.get(adaptor.get_env_var_name(), "").strip()
# API key validation only for platforms that require it
if target in ['claude', 'gemini', 'openai']:
if target in ["claude", "gemini", "openai"]:
if not api_key:
return False, f"{adaptor.get_env_var_name()} not set. Export your API key first."
@@ -172,41 +172,39 @@ Examples:
# ChromaDB upload options
parser.add_argument(
"--chroma-url",
help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)"
help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)",
)
parser.add_argument(
"--persist-directory",
help="Local directory for persistent ChromaDB storage (default: ./chroma_db)"
help="Local directory for persistent ChromaDB storage (default: ./chroma_db)",
)
parser.add_argument(
"--embedding-function",
choices=["openai", "sentence-transformers", "none"],
help="Embedding function for ChromaDB/Weaviate (default: platform default)"
help="Embedding function for ChromaDB/Weaviate (default: platform default)",
)
parser.add_argument(
"--openai-api-key",
help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)"
"--openai-api-key", help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)"
)
# Weaviate upload options
parser.add_argument(
"--weaviate-url",
default="http://localhost:8080",
help="Weaviate URL (default: http://localhost:8080)"
help="Weaviate URL (default: http://localhost:8080)",
)
parser.add_argument(
"--use-cloud",
action="store_true",
help="Use Weaviate Cloud (requires --api-key and --cluster-url)"
help="Use Weaviate Cloud (requires --api-key and --cluster-url)",
)
parser.add_argument(
"--cluster-url",
help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)"
"--cluster-url", help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)"
)
args = parser.parse_args()
@@ -214,28 +212,30 @@ Examples:
# Build kwargs for vector DB upload
upload_kwargs = {}
if args.target == 'chroma':
if args.target == "chroma":
if args.chroma_url:
upload_kwargs['chroma_url'] = args.chroma_url
upload_kwargs["chroma_url"] = args.chroma_url
if args.persist_directory:
upload_kwargs['persist_directory'] = args.persist_directory
upload_kwargs["persist_directory"] = args.persist_directory
if args.embedding_function:
upload_kwargs['embedding_function'] = args.embedding_function
upload_kwargs["embedding_function"] = args.embedding_function
if args.openai_api_key:
upload_kwargs['openai_api_key'] = args.openai_api_key
upload_kwargs["openai_api_key"] = args.openai_api_key
elif args.target == 'weaviate':
upload_kwargs['weaviate_url'] = args.weaviate_url
upload_kwargs['use_cloud'] = args.use_cloud
elif args.target == "weaviate":
upload_kwargs["weaviate_url"] = args.weaviate_url
upload_kwargs["use_cloud"] = args.use_cloud
if args.cluster_url:
upload_kwargs['cluster_url'] = args.cluster_url
upload_kwargs["cluster_url"] = args.cluster_url
if args.embedding_function:
upload_kwargs['embedding_function'] = args.embedding_function
upload_kwargs["embedding_function"] = args.embedding_function
if args.openai_api_key:
upload_kwargs['openai_api_key'] = args.openai_api_key
upload_kwargs["openai_api_key"] = args.openai_api_key
# Upload skill
success, message = upload_skill_api(args.package_file, args.target, args.api_key, **upload_kwargs)
success, message = upload_skill_api(
args.package_file, args.target, args.api_key, **upload_kwargs
)
if success:
sys.exit(0)

View File

@@ -23,9 +23,9 @@ from .generator import EmbeddingGenerator
from .cache import EmbeddingCache
__all__ = [
'EmbeddingRequest',
'EmbeddingResponse',
'BatchEmbeddingRequest',
'EmbeddingGenerator',
'EmbeddingCache',
"EmbeddingRequest",
"EmbeddingResponse",
"BatchEmbeddingRequest",
"EmbeddingGenerator",
"EmbeddingCache",
]

View File

@@ -74,12 +74,7 @@ class EmbeddingCache:
self.conn.commit()
def set(
self,
hash_key: str,
embedding: list[float],
model: str
) -> None:
def set(self, hash_key: str, embedding: list[float], model: str) -> None:
"""
Store embedding in cache.
@@ -94,11 +89,14 @@ class EmbeddingCache:
embedding_json = json.dumps(embedding)
dimensions = len(embedding)
cursor.execute("""
cursor.execute(
"""
INSERT OR REPLACE INTO embeddings
(hash, embedding, model, dimensions, created_at, accessed_at, access_count)
VALUES (?, ?, ?, ?, ?, ?, 1)
""", (hash_key, embedding_json, model, dimensions, now, now))
""",
(hash_key, embedding_json, model, dimensions, now, now),
)
self.conn.commit()
@@ -115,11 +113,14 @@ class EmbeddingCache:
cursor = self.conn.cursor()
# Get embedding
cursor.execute("""
cursor.execute(
"""
SELECT embedding, created_at
FROM embeddings
WHERE hash = ?
""", (hash_key,))
""",
(hash_key,),
)
row = cursor.fetchone()
if not row:
@@ -136,11 +137,14 @@ class EmbeddingCache:
# Update access stats
now = datetime.utcnow().isoformat()
cursor.execute("""
cursor.execute(
"""
UPDATE embeddings
SET accessed_at = ?, access_count = access_count + 1
WHERE hash = ?
""", (now, hash_key))
""",
(now, hash_key),
)
self.conn.commit()
return json.loads(embedding_json)
@@ -178,11 +182,14 @@ class EmbeddingCache:
"""
cursor = self.conn.cursor()
cursor.execute("""
cursor.execute(
"""
SELECT created_at
FROM embeddings
WHERE hash = ?
""", (hash_key,))
""",
(hash_key,),
)
row = cursor.fetchone()
if not row:
@@ -206,10 +213,13 @@ class EmbeddingCache:
"""
cursor = self.conn.cursor()
cursor.execute("""
cursor.execute(
"""
DELETE FROM embeddings
WHERE hash = ?
""", (hash_key,))
""",
(hash_key,),
)
self.conn.commit()
@@ -226,10 +236,13 @@ class EmbeddingCache:
cursor = self.conn.cursor()
if model:
cursor.execute("""
cursor.execute(
"""
DELETE FROM embeddings
WHERE model = ?
""", (model,))
""",
(model,),
)
else:
cursor.execute("DELETE FROM embeddings")
@@ -249,10 +262,13 @@ class EmbeddingCache:
cutoff = (datetime.utcnow() - timedelta(days=self.ttl_days)).isoformat()
cursor.execute("""
cursor.execute(
"""
DELETE FROM embeddings
WHERE created_at < ?
""", (cutoff,))
""",
(cutoff,),
)
deleted = cursor.rowcount
self.conn.commit()
@@ -300,17 +316,19 @@ class EmbeddingCache:
LIMIT 10
""")
top_accessed = [
{"hash": row[0], "model": row[1], "access_count": row[2]}
for row in cursor.fetchall()
{"hash": row[0], "model": row[1], "access_count": row[2]} for row in cursor.fetchall()
]
# Expired entries
cutoff = (datetime.utcnow() - timedelta(days=self.ttl_days)).isoformat()
cursor.execute("""
cursor.execute(
"""
SELECT COUNT(*)
FROM embeddings
WHERE created_at < ?
""", (cutoff,))
""",
(cutoff,),
)
expired = cursor.fetchone()[0]
return {
@@ -318,7 +336,7 @@ class EmbeddingCache:
"by_model": by_model,
"top_accessed": top_accessed,
"expired": expired,
"ttl_days": self.ttl_days
"ttl_days": self.ttl_days,
}
def close(self):

View File

@@ -9,6 +9,7 @@ import numpy as np
# OpenAI support
try:
from openai import OpenAI
OPENAI_AVAILABLE = True
except ImportError:
OPENAI_AVAILABLE = False
@@ -16,6 +17,7 @@ except ImportError:
# Sentence transformers support
try:
from sentence_transformers import SentenceTransformer
SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
SENTENCE_TRANSFORMERS_AVAILABLE = False
@@ -23,6 +25,7 @@ except ImportError:
# Voyage AI support (recommended by Anthropic for embeddings)
try:
import voyageai
VOYAGE_AVAILABLE = True
except ImportError:
VOYAGE_AVAILABLE = False
@@ -129,7 +132,7 @@ class EmbeddingGenerator:
self,
api_key: str | None = None,
voyage_api_key: str | None = None,
cache_dir: str | None = None
cache_dir: str | None = None,
):
"""
Initialize embedding generator.
@@ -162,8 +165,7 @@ class EmbeddingGenerator:
"""Get information about a model."""
if model not in self.MODELS:
raise ValueError(
f"Unknown model: {model}. "
f"Available models: {', '.join(self.MODELS.keys())}"
f"Unknown model: {model}. Available models: {', '.join(self.MODELS.keys())}"
)
return self.MODELS[model]
@@ -171,20 +173,19 @@ class EmbeddingGenerator:
"""List all available models."""
models = []
for name, info in self.MODELS.items():
models.append({
"name": name,
"provider": info["provider"],
"dimensions": info["dimensions"],
"max_tokens": info["max_tokens"],
"cost_per_million": info.get("cost_per_million", 0.0),
})
models.append(
{
"name": name,
"provider": info["provider"],
"dimensions": info["dimensions"],
"max_tokens": info["max_tokens"],
"cost_per_million": info.get("cost_per_million", 0.0),
}
)
return models
def generate(
self,
text: str,
model: str = "text-embedding-3-small",
normalize: bool = True
self, text: str, model: str = "text-embedding-3-small", normalize: bool = True
) -> list[float]:
"""
Generate embedding for a single text.
@@ -218,7 +219,7 @@ class EmbeddingGenerator:
texts: list[str],
model: str = "text-embedding-3-small",
normalize: bool = True,
batch_size: int = 32
batch_size: int = 32,
) -> tuple[list[list[float]], int]:
"""
Generate embeddings for multiple texts.
@@ -248,24 +249,18 @@ class EmbeddingGenerator:
else:
raise ValueError(f"Unsupported provider: {provider}")
def _generate_openai(
self, text: str, model: str, normalize: bool
) -> list[float]:
def _generate_openai(self, text: str, model: str, normalize: bool) -> list[float]:
"""Generate embedding using OpenAI API."""
if not OPENAI_AVAILABLE:
raise ImportError(
"OpenAI is required for OpenAI embeddings. "
"Install with: pip install openai"
"OpenAI is required for OpenAI embeddings. Install with: pip install openai"
)
if not self.openai_client:
raise ValueError("OpenAI API key not provided")
try:
response = self.openai_client.embeddings.create(
input=text,
model=model
)
response = self.openai_client.embeddings.create(input=text, model=model)
embedding = response.data[0].embedding
if normalize:
@@ -281,8 +276,7 @@ class EmbeddingGenerator:
"""Generate embeddings using OpenAI API in batches."""
if not OPENAI_AVAILABLE:
raise ImportError(
"OpenAI is required for OpenAI embeddings. "
"Install with: pip install openai"
"OpenAI is required for OpenAI embeddings. Install with: pip install openai"
)
if not self.openai_client:
@@ -292,13 +286,10 @@ class EmbeddingGenerator:
# Process in batches
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
batch = texts[i : i + batch_size]
try:
response = self.openai_client.embeddings.create(
input=batch,
model=model
)
response = self.openai_client.embeddings.create(input=batch, model=model)
batch_embeddings = [item.embedding for item in response.data]
@@ -313,24 +304,18 @@ class EmbeddingGenerator:
dimensions = len(all_embeddings[0]) if all_embeddings else 0
return all_embeddings, dimensions
def _generate_voyage(
self, text: str, model: str, normalize: bool
) -> list[float]:
def _generate_voyage(self, text: str, model: str, normalize: bool) -> list[float]:
"""Generate embedding using Voyage AI API."""
if not VOYAGE_AVAILABLE:
raise ImportError(
"voyageai is required for Voyage AI embeddings. "
"Install with: pip install voyageai"
"voyageai is required for Voyage AI embeddings. Install with: pip install voyageai"
)
if not self.voyage_client:
raise ValueError("Voyage API key not provided")
try:
result = self.voyage_client.embed(
texts=[text],
model=model
)
result = self.voyage_client.embed(texts=[text], model=model)
embedding = result.embeddings[0]
if normalize:
@@ -346,8 +331,7 @@ class EmbeddingGenerator:
"""Generate embeddings using Voyage AI API in batches."""
if not VOYAGE_AVAILABLE:
raise ImportError(
"voyageai is required for Voyage AI embeddings. "
"Install with: pip install voyageai"
"voyageai is required for Voyage AI embeddings. Install with: pip install voyageai"
)
if not self.voyage_client:
@@ -357,13 +341,10 @@ class EmbeddingGenerator:
# Process in batches (Voyage AI supports up to 128 texts per request)
for i in range(0, len(texts), batch_size):
batch = texts[i:i + batch_size]
batch = texts[i : i + batch_size]
try:
result = self.voyage_client.embed(
texts=batch,
model=model
)
result = self.voyage_client.embed(texts=batch, model=model)
batch_embeddings = result.embeddings
@@ -378,9 +359,7 @@ class EmbeddingGenerator:
dimensions = len(all_embeddings[0]) if all_embeddings else 0
return all_embeddings, dimensions
def _generate_sentence_transformer(
self, text: str, model: str, normalize: bool
) -> list[float]:
def _generate_sentence_transformer(self, text: str, model: str, normalize: bool) -> list[float]:
"""Generate embedding using sentence-transformers."""
if not SENTENCE_TRANSFORMERS_AVAILABLE:
raise ImportError(
@@ -417,10 +396,7 @@ class EmbeddingGenerator:
# Generate embeddings in batches
embeddings = st_model.encode(
texts,
batch_size=batch_size,
normalize_embeddings=normalize,
show_progress_bar=False
texts, batch_size=batch_size, normalize_embeddings=normalize, show_progress_bar=False
)
dimensions = len(embeddings[0]) if len(embeddings) > 0 else 0

View File

@@ -14,20 +14,14 @@ class EmbeddingRequest(BaseModel):
"example": {
"text": "This is a test document about Python programming.",
"model": "text-embedding-3-small",
"normalize": True
"normalize": True,
}
}
)
text: str = Field(..., description="Text to generate embedding for")
model: str = Field(
default="text-embedding-3-small",
description="Embedding model to use"
)
normalize: bool = Field(
default=True,
description="Normalize embeddings to unit length"
)
model: str = Field(default="text-embedding-3-small", description="Embedding model to use")
normalize: bool = Field(default=True, description="Normalize embeddings to unit length")
class BatchEmbeddingRequest(BaseModel):
@@ -39,27 +33,20 @@ class BatchEmbeddingRequest(BaseModel):
"texts": [
"First document about Python",
"Second document about JavaScript",
"Third document about Rust"
"Third document about Rust",
],
"model": "text-embedding-3-small",
"normalize": True,
"batch_size": 32
"batch_size": 32,
}
}
)
texts: list[str] = Field(..., description="List of texts to embed")
model: str = Field(
default="text-embedding-3-small",
description="Embedding model to use"
)
normalize: bool = Field(
default=True,
description="Normalize embeddings to unit length"
)
model: str = Field(default="text-embedding-3-small", description="Embedding model to use")
normalize: bool = Field(default=True, description="Normalize embeddings to unit length")
batch_size: int | None = Field(
default=32,
description="Batch size for processing (default: 32)"
default=32, description="Batch size for processing (default: 32)"
)
@@ -69,10 +56,7 @@ class EmbeddingResponse(BaseModel):
embedding: list[float] = Field(..., description="Generated embedding vector")
model: str = Field(..., description="Model used for generation")
dimensions: int = Field(..., description="Embedding dimensions")
cached: bool = Field(
default=False,
description="Whether embedding was retrieved from cache"
)
cached: bool = Field(default=False, description="Whether embedding was retrieved from cache")
class BatchEmbeddingResponse(BaseModel):
@@ -82,10 +66,7 @@ class BatchEmbeddingResponse(BaseModel):
model: str = Field(..., description="Model used for generation")
dimensions: int = Field(..., description="Embedding dimensions")
count: int = Field(..., description="Number of embeddings generated")
cached_count: int = Field(
default=0,
description="Number of embeddings retrieved from cache"
)
cached_count: int = Field(default=0, description="Number of embeddings retrieved from cache")
class SkillEmbeddingRequest(BaseModel):
@@ -97,24 +78,15 @@ class SkillEmbeddingRequest(BaseModel):
"skill_path": "/path/to/skill/react",
"model": "text-embedding-3-small",
"chunk_size": 512,
"overlap": 50
"overlap": 50,
}
}
)
skill_path: str = Field(..., description="Path to skill directory")
model: str = Field(
default="text-embedding-3-small",
description="Embedding model to use"
)
chunk_size: int = Field(
default=512,
description="Chunk size for splitting documents (tokens)"
)
overlap: int = Field(
default=50,
description="Overlap between chunks (tokens)"
)
model: str = Field(default="text-embedding-3-small", description="Embedding model to use")
chunk_size: int = Field(default=512, description="Chunk size for splitting documents (tokens)")
overlap: int = Field(default=50, description="Overlap between chunks (tokens)")
class SkillEmbeddingResponse(BaseModel):
@@ -124,10 +96,7 @@ class SkillEmbeddingResponse(BaseModel):
total_chunks: int = Field(..., description="Total number of chunks embedded")
model: str = Field(..., description="Model used for generation")
dimensions: int = Field(..., description="Embedding dimensions")
metadata: dict[str, Any] = Field(
default_factory=dict,
description="Skill metadata"
)
metadata: dict[str, Any] = Field(default_factory=dict, description="Skill metadata")
class HealthResponse(BaseModel):
@@ -144,12 +113,13 @@ class ModelInfo(BaseModel):
"""Information about an embedding model."""
name: str = Field(..., description="Model name")
provider: str = Field(..., description="Model provider (openai, anthropic, sentence-transformers)")
provider: str = Field(
..., description="Model provider (openai, anthropic, sentence-transformers)"
)
dimensions: int = Field(..., description="Embedding dimensions")
max_tokens: int = Field(..., description="Maximum input tokens")
cost_per_million: float | None = Field(
None,
description="Cost per million tokens (if applicable)"
None, description="Cost per million tokens (if applicable)"
)

View File

@@ -25,6 +25,7 @@ try:
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
FASTAPI_AVAILABLE = True
except ImportError:
FASTAPI_AVAILABLE = False
@@ -51,7 +52,7 @@ if FASTAPI_AVAILABLE:
description="Generate embeddings for text and skill content",
version="1.0.0",
docs_url="/docs",
redoc_url="/redoc"
redoc_url="/redoc",
)
# Add CORS middleware
@@ -64,13 +65,14 @@ if FASTAPI_AVAILABLE:
)
# Initialize generator and cache
cache_dir = os.getenv("EMBEDDING_CACHE_DIR", os.path.expanduser("~/.cache/skill-seekers/embeddings"))
cache_dir = os.getenv(
"EMBEDDING_CACHE_DIR", os.path.expanduser("~/.cache/skill-seekers/embeddings")
)
cache_db = os.path.join(cache_dir, "embeddings.db")
cache_enabled = os.getenv("EMBEDDING_CACHE_ENABLED", "true").lower() == "true"
generator = EmbeddingGenerator(
api_key=os.getenv("OPENAI_API_KEY"),
voyage_api_key=os.getenv("VOYAGE_API_KEY")
api_key=os.getenv("OPENAI_API_KEY"), voyage_api_key=os.getenv("VOYAGE_API_KEY")
)
cache = EmbeddingCache(cache_db) if cache_enabled else None
@@ -81,7 +83,7 @@ if FASTAPI_AVAILABLE:
"service": "Skill Seekers Embedding API",
"version": "1.0.0",
"docs": "/docs",
"health": "/health"
"health": "/health",
}
@app.get("/health", response_model=HealthResponse)
@@ -95,7 +97,7 @@ if FASTAPI_AVAILABLE:
version="1.0.0",
models=models,
cache_enabled=cache_enabled,
cache_size=cache_size
cache_size=cache_size,
)
@app.get("/models", response_model=ModelsResponse)
@@ -109,15 +111,12 @@ if FASTAPI_AVAILABLE:
provider=m["provider"],
dimensions=m["dimensions"],
max_tokens=m["max_tokens"],
cost_per_million=m.get("cost_per_million")
cost_per_million=m.get("cost_per_million"),
)
for m in models_list
]
return ModelsResponse(
models=model_infos,
count=len(model_infos)
)
return ModelsResponse(models=model_infos, count=len(model_infos))
@app.post("/embed", response_model=EmbeddingResponse)
async def embed_text(request: EmbeddingRequest):
@@ -144,9 +143,7 @@ if FASTAPI_AVAILABLE:
else:
# Generate embedding
embedding = generator.generate(
request.text,
model=request.model,
normalize=request.normalize
request.text, model=request.model, normalize=request.normalize
)
# Store in cache
@@ -154,10 +151,7 @@ if FASTAPI_AVAILABLE:
cache.set(hash_key, embedding, request.model)
return EmbeddingResponse(
embedding=embedding,
model=request.model,
dimensions=len(embedding),
cached=cached
embedding=embedding, model=request.model, dimensions=len(embedding), cached=cached
)
except Exception as e:
@@ -202,11 +196,13 @@ if FASTAPI_AVAILABLE:
texts_to_generate,
model=request.model,
normalize=request.normalize,
batch_size=request.batch_size
batch_size=request.batch_size,
)
# Fill in placeholders and cache
for idx, text, embedding in zip(text_indices, texts_to_generate, generated_embeddings, strict=False):
for idx, text, embedding in zip(
text_indices, texts_to_generate, generated_embeddings, strict=False
):
embeddings[idx] = embedding
if cache:
@@ -220,7 +216,7 @@ if FASTAPI_AVAILABLE:
model=request.model,
dimensions=dimensions,
count=len(embeddings),
cached_count=cached_count
cached_count=cached_count,
)
except Exception as e:
@@ -244,12 +240,16 @@ if FASTAPI_AVAILABLE:
skill_path = Path(request.skill_path)
if not skill_path.exists():
raise HTTPException(status_code=404, detail=f"Skill path not found: {request.skill_path}")
raise HTTPException(
status_code=404, detail=f"Skill path not found: {request.skill_path}"
)
# Read SKILL.md
skill_md = skill_path / "SKILL.md"
if not skill_md.exists():
raise HTTPException(status_code=404, detail=f"SKILL.md not found in {request.skill_path}")
raise HTTPException(
status_code=404, detail=f"SKILL.md not found in {request.skill_path}"
)
skill_content = skill_md.read_text()
@@ -262,10 +262,7 @@ if FASTAPI_AVAILABLE:
# Generate embeddings for chunks
embeddings, dimensions = generator.generate_batch(
chunks,
model=request.model,
normalize=True,
batch_size=32
chunks, model=request.model, normalize=True, batch_size=32
)
# TODO: Store embeddings in vector database
@@ -279,8 +276,8 @@ if FASTAPI_AVAILABLE:
metadata={
"skill_path": str(skill_path),
"chunks": len(chunks),
"content_length": len(skill_content)
}
"content_length": len(skill_content),
},
)
except HTTPException:
@@ -298,7 +295,7 @@ if FASTAPI_AVAILABLE:
@app.post("/cache/clear", response_model=dict)
async def clear_cache(
model: str | None = Query(None, description="Model to clear (all if not specified)")
model: str | None = Query(None, description="Model to clear (all if not specified)"),
):
"""Clear cache entries."""
if not cache:
@@ -306,11 +303,7 @@ if FASTAPI_AVAILABLE:
deleted = cache.clear(model=model)
return {
"status": "ok",
"deleted": deleted,
"model": model or "all"
}
return {"status": "ok", "deleted": deleted, "model": model or "all"}
@app.post("/cache/clear-expired", response_model=dict)
async def clear_expired():
@@ -320,10 +313,7 @@ if FASTAPI_AVAILABLE:
deleted = cache.clear_expired()
return {
"status": "ok",
"deleted": deleted
}
return {"status": "ok", "deleted": deleted}
else:
print("Error: FastAPI not available. Install with: pip install fastapi uvicorn")
@@ -348,12 +338,7 @@ def main():
if cache_enabled:
print(f"💾 Cache database: {cache_db}")
uvicorn.run(
"skill_seekers.embedding.server:app",
host=host,
port=port,
reload=reload
)
uvicorn.run("skill_seekers.embedding.server:app", host=host, port=port, reload=reload)
if __name__ == "__main__":

View File

@@ -69,15 +69,17 @@ async def generate_config(args: dict) -> list[TextContent]:
config = {
"name": name,
"description": description,
"sources": [{
"type": "documentation",
"base_url": url,
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
"url_patterns": {"include": [], "exclude": []},
"categories": {},
"rate_limit": rate_limit,
"max_pages": max_pages,
}],
"sources": [
{
"type": "documentation",
"base_url": url,
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
"url_patterns": {"include": [], "exclude": []},
"categories": {},
"rate_limit": rate_limit,
"max_pages": max_pages,
}
],
}
# Save to configs directory

View File

@@ -32,9 +32,9 @@ from .detector import ChangeDetector
from .models import SyncConfig, ChangeReport, PageChange
__all__ = [
'SyncMonitor',
'ChangeDetector',
'SyncConfig',
'ChangeReport',
'PageChange',
"SyncMonitor",
"ChangeDetector",
"SyncConfig",
"ChangeReport",
"PageChange",
]

View File

@@ -55,7 +55,7 @@ class ChangeDetector:
Returns:
Hexadecimal hash string
"""
return hashlib.sha256(content.encode('utf-8')).hexdigest()
return hashlib.sha256(content.encode("utf-8")).hexdigest()
def fetch_page(self, url: str) -> tuple[str, dict[str, str]]:
"""
@@ -72,17 +72,15 @@ class ChangeDetector:
requests.RequestException: If fetch fails
"""
response = requests.get(
url,
timeout=self.timeout,
headers={'User-Agent': 'SkillSeekers-Sync/1.0'}
url, timeout=self.timeout, headers={"User-Agent": "SkillSeekers-Sync/1.0"}
)
response.raise_for_status()
metadata = {
'last-modified': response.headers.get('Last-Modified'),
'etag': response.headers.get('ETag'),
'content-type': response.headers.get('Content-Type'),
'content-length': response.headers.get('Content-Length'),
"last-modified": response.headers.get("Last-Modified"),
"etag": response.headers.get("ETag"),
"content-type": response.headers.get("Content-Type"),
"content-length": response.headers.get("Content-Length"),
}
return response.text, metadata
@@ -92,7 +90,7 @@ class ChangeDetector:
url: str,
old_hash: str | None = None,
generate_diff: bool = False,
old_content: str | None = None
old_content: str | None = None,
) -> PageChange:
"""
Check if page has changed.
@@ -132,7 +130,7 @@ class ChangeDetector:
old_hash=old_hash,
new_hash=new_hash,
diff=diff,
detected_at=datetime.utcnow()
detected_at=datetime.utcnow(),
)
except requests.RequestException:
@@ -142,14 +140,11 @@ class ChangeDetector:
change_type=ChangeType.DELETED,
old_hash=old_hash,
new_hash=None,
detected_at=datetime.utcnow()
detected_at=datetime.utcnow(),
)
def check_pages(
self,
urls: list[str],
previous_hashes: dict[str, str],
generate_diffs: bool = False
self, urls: list[str], previous_hashes: dict[str, str], generate_diffs: bool = False
) -> ChangeReport:
"""
Check multiple pages for changes.
@@ -185,13 +180,15 @@ class ChangeDetector:
# Check for deleted pages (in previous state but not in current)
for url, old_hash in previous_hashes.items():
if url not in checked_urls:
deleted.append(PageChange(
url=url,
change_type=ChangeType.DELETED,
old_hash=old_hash,
new_hash=None,
detected_at=datetime.utcnow()
))
deleted.append(
PageChange(
url=url,
change_type=ChangeType.DELETED,
old_hash=old_hash,
new_hash=None,
detected_at=datetime.utcnow(),
)
)
return ChangeReport(
skill_name="unknown", # To be set by caller
@@ -200,7 +197,7 @@ class ChangeDetector:
modified=modified,
deleted=deleted,
unchanged=unchanged_count,
checked_at=datetime.utcnow()
checked_at=datetime.utcnow(),
)
def generate_diff(self, old_content: str, new_content: str) -> str:
@@ -217,15 +214,9 @@ class ChangeDetector:
old_lines = old_content.splitlines(keepends=True)
new_lines = new_content.splitlines(keepends=True)
diff = difflib.unified_diff(
old_lines,
new_lines,
fromfile='old',
tofile='new',
lineterm=''
)
diff = difflib.unified_diff(old_lines, new_lines, fromfile="old", tofile="new", lineterm="")
return ''.join(diff)
return "".join(diff)
def generate_summary_diff(self, old_content: str, new_content: str) -> str:
"""
@@ -244,16 +235,15 @@ class ChangeDetector:
diff = difflib.unified_diff(old_lines, new_lines)
diff_lines = list(diff)
added = sum(1 for line in diff_lines if line.startswith('+') and not line.startswith('+++'))
removed = sum(1 for line in diff_lines if line.startswith('-') and not line.startswith('---'))
added = sum(1 for line in diff_lines if line.startswith("+") and not line.startswith("+++"))
removed = sum(
1 for line in diff_lines if line.startswith("-") and not line.startswith("---")
)
return f"+{added} -{removed} lines"
def check_header_changes(
self,
url: str,
old_modified: str | None = None,
old_etag: str | None = None
self, url: str, old_modified: str | None = None, old_etag: str | None = None
) -> bool:
"""
Quick check using HTTP headers (no content download).
@@ -269,14 +259,12 @@ class ChangeDetector:
try:
# Use HEAD request for efficiency
response = requests.head(
url,
timeout=self.timeout,
headers={'User-Agent': 'SkillSeekers-Sync/1.0'}
url, timeout=self.timeout, headers={"User-Agent": "SkillSeekers-Sync/1.0"}
)
response.raise_for_status()
new_modified = response.headers.get('Last-Modified')
new_etag = response.headers.get('ETag')
new_modified = response.headers.get("Last-Modified")
new_etag = response.headers.get("ETag")
# Check if headers indicate change
if old_modified and new_modified and old_modified != new_modified:
@@ -289,9 +277,7 @@ class ChangeDetector:
return True
def batch_check_headers(
self,
urls: list[str],
previous_metadata: dict[str, dict[str, str]]
self, urls: list[str], previous_metadata: dict[str, dict[str, str]]
) -> list[str]:
"""
Batch check URLs using headers only.
@@ -307,8 +293,8 @@ class ChangeDetector:
for url in urls:
old_meta = previous_metadata.get(url, {})
old_modified = old_meta.get('last-modified')
old_etag = old_meta.get('etag')
old_modified = old_meta.get("last-modified")
old_etag = old_meta.get("etag")
if self.check_header_changes(url, old_modified, old_etag):
changed_urls.append(url)

View File

@@ -10,6 +10,7 @@ from pydantic import BaseModel, Field
class ChangeType(str, Enum):
"""Type of change detected."""
ADDED = "added"
MODIFIED = "modified"
DELETED = "deleted"
@@ -25,8 +26,7 @@ class PageChange(BaseModel):
new_hash: str | None = Field(None, description="New content hash")
diff: str | None = Field(None, description="Content diff (if available)")
detected_at: datetime = Field(
default_factory=datetime.utcnow,
description="When change was detected"
default_factory=datetime.utcnow, description="When change was detected"
)
class Config:
@@ -37,7 +37,7 @@ class PageChange(BaseModel):
"old_hash": "abc123",
"new_hash": "def456",
"diff": "@@ -10,3 +10,4 @@\n+New content here",
"detected_at": "2024-01-15T10:30:00Z"
"detected_at": "2024-01-15T10:30:00Z",
}
}
@@ -52,8 +52,7 @@ class ChangeReport(BaseModel):
deleted: list[PageChange] = Field(default_factory=list, description="Deleted pages")
unchanged: int = Field(0, description="Number of unchanged pages")
checked_at: datetime = Field(
default_factory=datetime.utcnow,
description="When check was performed"
default_factory=datetime.utcnow, description="When check was performed"
)
@property
@@ -72,34 +71,19 @@ class SyncConfig(BaseModel):
skill_config: str = Field(..., description="Path to skill config file")
check_interval: int = Field(
default=3600,
description="Check interval in seconds (default: 1 hour)"
default=3600, description="Check interval in seconds (default: 1 hour)"
)
enabled: bool = Field(default=True, description="Whether sync is enabled")
auto_update: bool = Field(
default=False,
description="Automatically rebuild skill on changes"
)
notify_on_change: bool = Field(
default=True,
description="Send notifications on changes"
)
auto_update: bool = Field(default=False, description="Automatically rebuild skill on changes")
notify_on_change: bool = Field(default=True, description="Send notifications on changes")
notification_channels: list[str] = Field(
default_factory=list,
description="Notification channels (email, slack, webhook)"
)
webhook_url: str | None = Field(
None,
description="Webhook URL for change notifications"
default_factory=list, description="Notification channels (email, slack, webhook)"
)
webhook_url: str | None = Field(None, description="Webhook URL for change notifications")
email_recipients: list[str] = Field(
default_factory=list,
description="Email recipients for notifications"
)
slack_webhook: str | None = Field(
None,
description="Slack webhook URL"
default_factory=list, description="Email recipients for notifications"
)
slack_webhook: str | None = Field(None, description="Slack webhook URL")
class Config:
json_schema_extra = {
@@ -111,7 +95,7 @@ class SyncConfig(BaseModel):
"notify_on_change": True,
"notification_channels": ["slack", "webhook"],
"webhook_url": "https://example.com/webhook",
"slack_webhook": "https://hooks.slack.com/services/..."
"slack_webhook": "https://hooks.slack.com/services/...",
}
}
@@ -125,8 +109,7 @@ class SyncState(BaseModel):
total_checks: int = Field(default=0, description="Total checks performed")
total_changes: int = Field(default=0, description="Total changes detected")
page_hashes: dict[str, str] = Field(
default_factory=dict,
description="URL -> content hash mapping"
default_factory=dict, description="URL -> content hash mapping"
)
status: str = Field(default="idle", description="Current status")
error: str | None = Field(None, description="Last error message")
@@ -137,15 +120,9 @@ class WebhookPayload(BaseModel):
event: str = Field(..., description="Event type (change_detected, sync_complete)")
skill_name: str = Field(..., description="Skill name")
timestamp: datetime = Field(
default_factory=datetime.utcnow,
description="Event timestamp"
)
timestamp: datetime = Field(default_factory=datetime.utcnow, description="Event timestamp")
changes: ChangeReport | None = Field(None, description="Change report")
metadata: dict[str, Any] = Field(
default_factory=dict,
description="Additional metadata"
)
metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
class Config:
json_schema_extra = {
@@ -157,8 +134,8 @@ class WebhookPayload(BaseModel):
"total_pages": 150,
"added": [],
"modified": [{"url": "https://react.dev/learn"}],
"deleted": []
"deleted": [],
},
"metadata": {"source": "periodic_check"}
"metadata": {"source": "periodic_check"},
}
}

View File

@@ -51,7 +51,7 @@ class SyncMonitor:
check_interval: int = 3600,
auto_update: bool = False,
state_file: str | None = None,
on_change: Callable[[ChangeReport], None] | None = None
on_change: Callable[[ChangeReport], None] | None = None,
):
"""
Initialize sync monitor.
@@ -72,7 +72,7 @@ class SyncMonitor:
with open(self.config_path) as f:
self.skill_config = json.load(f)
self.skill_name = self.skill_config.get('name', 'unknown')
self.skill_name = self.skill_config.get("name", "unknown")
# State file
if state_file:
@@ -97,10 +97,10 @@ class SyncMonitor:
with open(self.state_file) as f:
data = json.load(f)
# Convert datetime strings back
if data.get('last_check'):
data['last_check'] = datetime.fromisoformat(data['last_check'])
if data.get('last_change'):
data['last_change'] = datetime.fromisoformat(data['last_change'])
if data.get("last_check"):
data["last_check"] = datetime.fromisoformat(data["last_check"])
if data.get("last_change"):
data["last_change"] = datetime.fromisoformat(data["last_change"])
return SyncState(**data)
else:
return SyncState(skill_name=self.skill_name)
@@ -109,12 +109,12 @@ class SyncMonitor:
"""Save current state to file."""
# Convert datetime to ISO format
data = self.state.dict()
if data.get('last_check'):
data['last_check'] = data['last_check'].isoformat()
if data.get('last_change'):
data['last_change'] = data['last_change'].isoformat()
if data.get("last_check"):
data["last_check"] = data["last_check"].isoformat()
if data.get("last_change"):
data["last_change"] = data["last_change"].isoformat()
with open(self.state_file, 'w') as f:
with open(self.state_file, "w") as f:
json.dump(data, f, indent=2)
def check_now(self, generate_diffs: bool = False) -> ChangeReport:
@@ -132,7 +132,7 @@ class SyncMonitor:
try:
# Get URLs to check from config
base_url = self.skill_config.get('base_url')
base_url = self.skill_config.get("base_url")
# TODO: In real implementation, get actual URLs from scraper
# For now, simulate with base URL only
@@ -140,9 +140,7 @@ class SyncMonitor:
# Check for changes
report = self.detector.check_pages(
urls=urls,
previous_hashes=self.state.page_hashes,
generate_diffs=generate_diffs
urls=urls, previous_hashes=self.state.page_hashes, generate_diffs=generate_diffs
)
report.skill_name = self.skill_name
@@ -192,7 +190,7 @@ class SyncMonitor:
event="change_detected",
skill_name=self.skill_name,
changes=report,
metadata={"auto_update": self.auto_update}
metadata={"auto_update": self.auto_update},
)
self.notifier.send(payload)
@@ -214,9 +212,7 @@ class SyncMonitor:
self._running = True
# Schedule checks
schedule.every(self.check_interval).seconds.do(
lambda: self.check_now()
)
schedule.every(self.check_interval).seconds.do(lambda: self.check_now())
# Run in thread
def run_schedule():

View File

@@ -34,7 +34,7 @@ class Notifier:
webhook_url: str | None = None,
slack_webhook: str | None = None,
email_recipients: list[str] | None = None,
console: bool = True
console: bool = True,
):
"""
Initialize notifier.
@@ -45,8 +45,8 @@ class Notifier:
email_recipients: List of email recipients
console: Whether to print to console
"""
self.webhook_url = webhook_url or os.getenv('SYNC_WEBHOOK_URL')
self.slack_webhook = slack_webhook or os.getenv('SLACK_WEBHOOK_URL')
self.webhook_url = webhook_url or os.getenv("SYNC_WEBHOOK_URL")
self.slack_webhook = slack_webhook or os.getenv("SLACK_WEBHOOK_URL")
self.email_recipients = email_recipients or []
self.console = console
@@ -92,8 +92,8 @@ class Notifier:
response = requests.post(
self.webhook_url,
json=payload.dict(),
headers={'Content-Type': 'application/json'},
timeout=10
headers={"Content-Type": "application/json"},
timeout=10,
)
response.raise_for_status()
print(f"✅ Webhook notification sent to {self.webhook_url}")
@@ -124,14 +124,10 @@ class Notifier:
slack_payload = {
"text": text,
"username": "Skill Seekers Sync",
"icon_emoji": ":books:"
"icon_emoji": ":books:",
}
response = requests.post(
self.slack_webhook,
json=slack_payload,
timeout=10
)
response = requests.post(self.slack_webhook, json=slack_payload, timeout=10)
response.raise_for_status()
print("✅ Slack notification sent")
except Exception as e: